Chapter 12: Data processing and analysis with pandas

Robert Johansson

Source code listings for Numerical Python - A Practical Techniques Approach for Industry (ISBN 978-1-484205-54-9).

The source code listings can be downloaded from http://www.apress.com/9781484205549

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
import numpy as np
In [3]:
import pandas as pd
In [4]:
pd.set_option('display.mpl_style', 'default')

Series object

In [5]:
s = pd.Series([909976, 8615246, 2872086, 2273305])
In [6]:
s
Out[6]:
0     909976
1    8615246
2    2872086
3    2273305
dtype: int64
In [7]:
type(s)
Out[7]:
pandas.core.series.Series
In [8]:
s.dtype
Out[8]:
dtype('int64')
In [9]:
s.index
Out[9]:
Int64Index([0, 1, 2, 3], dtype='int64')
In [10]:
s.values
Out[10]:
array([ 909976, 8615246, 2872086, 2273305])
In [11]:
s.index = ["Stockholm", "London", "Rome", "Paris"]
In [12]:
s.name = "Population"
In [13]:
s
Out[13]:
Stockholm     909976
London       8615246
Rome         2872086
Paris        2273305
Name: Population, dtype: int64
In [14]:
s = pd.Series([909976, 8615246, 2872086, 2273305], 
              index=["Stockholm", "London", "Rome", "Paris"], name="Population")
In [15]:
s["London"]
Out[15]:
8615246
In [16]:
s.Stockholm
Out[16]:
909976
In [17]:
s[["Paris", "Rome"]]
Out[17]:
Paris    2273305
Rome     2872086
Name: Population, dtype: int64
In [18]:
s.median(), s.mean(), s.std()
Out[18]:
(2572695.5, 3667653.25, 3399048.5005155364)
In [19]:
s.min(), s.max()
Out[19]:
(909976, 8615246)
In [20]:
s.quantile(q=0.25), s.quantile(q=0.5), s.quantile(q=0.75)
Out[20]:
(1932472.75, 2572695.5, 4307876.0)
In [21]:
s.describe()
Out[21]:
count          4.000000
mean     3667653.250000
std      3399048.500516
min       909976.000000
25%      1932472.750000
50%      2572695.500000
75%      4307876.000000
max      8615246.000000
Name: Population, dtype: float64
In [22]:
fig, axes = plt.subplots(1, 4, figsize=(12, 3))

s.plot(ax=axes[0], kind='line', title="line")
s.plot(ax=axes[1], kind='bar', title="bar")
s.plot(ax=axes[2], kind='box', title="box")
s.plot(ax=axes[3], kind='pie', title="pie")

fig.tight_layout()
fig.savefig("ch12-series-plot.pdf")
fig.savefig("ch12-series-plot.png")

DataFrame object

In [23]:
df = pd.DataFrame([[909976, 8615246, 2872086, 2273305],
                   ["Sweden", "United kingdom", "Italy", "France"]])
In [24]:
df
Out[24]:
0 1 2 3
0 909976 8615246 2872086 2273305
1 Sweden United kingdom Italy France
In [25]:
df = pd.DataFrame([[909976, "Sweden"],
                   [8615246, "United kingdom"], 
                   [2872086, "Italy"],
                   [2273305, "France"]])
In [26]:
df
Out[26]:
0 1
0 909976 Sweden
1 8615246 United kingdom
2 2872086 Italy
3 2273305 France
In [27]:
df.index = ["Stockholm", "London", "Rome", "Paris"]
In [28]:
df.columns = ["Population", "State"]
In [29]:
df
Out[29]:
Population State
Stockholm 909976 Sweden
London 8615246 United kingdom
Rome 2872086 Italy
Paris 2273305 France
In [30]:
df = pd.DataFrame([[909976, "Sweden"],
                   [8615246, "United kingdom"], 
                   [2872086, "Italy"],
                   [2273305, "France"]],
                  index=["Stockholm", "London", "Rome", "Paris"],
                  columns=["Population", "State"])
In [31]:
df
Out[31]:
Population State
Stockholm 909976 Sweden
London 8615246 United kingdom
Rome 2872086 Italy
Paris 2273305 France
In [32]:
df = pd.DataFrame({"Population": [909976, 8615246, 2872086, 2273305],
                   "State": ["Sweden", "United kingdom", "Italy", "France"]},
                  index=["Stockholm", "London", "Rome", "Paris"])
In [33]:
df
Out[33]:
Population State
Stockholm 909976 Sweden
London 8615246 United kingdom
Rome 2872086 Italy
Paris 2273305 France
In [34]:
df.index
Out[34]:
Index([u'Stockholm', u'London', u'Rome', u'Paris'], dtype='object')
In [35]:
df.columns
Out[35]:
Index([u'Population', u'State'], dtype='object')
In [36]:
df.values
Out[36]:
array([[909976, 'Sweden'],
       [8615246, 'United kingdom'],
       [2872086, 'Italy'],
       [2273305, 'France']], dtype=object)
In [37]:
df.Population
Out[37]:
Stockholm     909976
London       8615246
Rome         2872086
Paris        2273305
Name: Population, dtype: int64
In [38]:
df["Population"]
Out[38]:
Stockholm     909976
London       8615246
Rome         2872086
Paris        2273305
Name: Population, dtype: int64
In [39]:
type(df.Population)
Out[39]:
pandas.core.series.Series
In [40]:
df.Population.Stockholm
Out[40]:
909976
In [41]:
type(df.ix)
Out[41]:
pandas.core.indexing._IXIndexer
In [42]:
df.ix["Stockholm"]
Out[42]:
Population    909976
State         Sweden
Name: Stockholm, dtype: object
In [43]:
type(df.ix["Stockholm"])
Out[43]:
pandas.core.series.Series
In [44]:
df.ix[["Paris", "Rome"]]
Out[44]:
Population State
Paris 2273305 France
Rome 2872086 Italy
In [45]:
df.ix[["Paris", "Rome"], "Population"]
Out[45]:
Paris    2273305
Rome     2872086
Name: Population, dtype: int64
In [46]:
df.ix["Paris", "Population"]
Out[46]:
2273305
In [47]:
df.mean()
Out[47]:
Population    3667653.25
dtype: float64
In [48]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Stockholm to Paris
Data columns (total 2 columns):
Population    4 non-null int64
State         4 non-null object
dtypes: int64(1), object(1)
memory usage: 96.0+ bytes
In [49]:
df.dtypes
Out[49]:
Population     int64
State         object
dtype: object
In [50]:
df.head()
Out[50]:
Population State
Stockholm 909976 Sweden
London 8615246 United kingdom
Rome 2872086 Italy
Paris 2273305 France
In [51]:
!head -n5 /home/rob/datasets/european_cities.csv
Rank,City,State,Official population,Date of census/estimate
1,London[2], United Kingdom,"8,615,246",1 June 2014
2,Berlin, Germany,"3,437,916",31 May 2014
3,Madrid, Spain,"3,165,235",1 January 2014
4,Rome, Italy,"2,872,086",30 September 2014

Larger dataset

In [52]:
df_pop = pd.read_csv("european_cities.csv")
In [53]:
df_pop.head()
Out[53]:
Rank City State Population Date of census/estimate
0 1 London[2] United Kingdom 8,615,246 1 June 2014
1 2 Berlin Germany 3,437,916 31 May 2014
2 3 Madrid Spain 3,165,235 1 January 2014
3 4 Rome Italy 2,872,086 30 September 2014
4 5 Paris France 2,273,305 1 January 2013
In [54]:
df_pop = pd.read_csv("european_cities.csv", delimiter=",", encoding="utf-8", header=0)
In [55]:
df_pop.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 105 entries, 0 to 104
Data columns (total 5 columns):
Rank                       105 non-null int64
City                       105 non-null object
State                      105 non-null object
Population                 105 non-null object
Date of census/estimate    105 non-null object
dtypes: int64(1), object(4)
memory usage: 4.9+ KB
In [56]:
df_pop.head()
Out[56]:
Rank City State Population Date of census/estimate
0 1 London[2] United Kingdom 8,615,246 1 June 2014
1 2 Berlin Germany 3,437,916 31 May 2014
2 3 Madrid Spain 3,165,235 1 January 2014
3 4 Rome Italy 2,872,086 30 September 2014
4 5 Paris France 2,273,305 1 January 2013
In [57]:
df_pop["NumericPopulation"] = df_pop.Population.apply(lambda x: int(x.replace(",", "")))
In [58]:
df_pop["State"].values[:3]
Out[58]:
array([u' United Kingdom', u' Germany', u' Spain'], dtype=object)
In [59]:
df_pop["State"] = df_pop["State"].apply(lambda x: x.strip())
In [60]:
df_pop.head()
Out[60]:
Rank City State Population Date of census/estimate NumericPopulation
0 1 London[2] United Kingdom 8,615,246 1 June 2014 8615246
1 2 Berlin Germany 3,437,916 31 May 2014 3437916
2 3 Madrid Spain 3,165,235 1 January 2014 3165235
3 4 Rome Italy 2,872,086 30 September 2014 2872086
4 5 Paris France 2,273,305 1 January 2013 2273305
In [61]:
df_pop.dtypes
Out[61]:
Rank                        int64
City                       object
State                      object
Population                 object
Date of census/estimate    object
NumericPopulation           int64
dtype: object
In [62]:
df_pop2 = df_pop.set_index("City")
In [63]:
df_pop2 = df_pop2.sort_index()
In [64]:
df_pop2.head()
Out[64]:
Rank State Population Date of census/estimate NumericPopulation
City
Aarhus 92 Denmark 326,676 1 October 2014 326676
Alicante 86 Spain 334,678 1 January 2012 334678
Amsterdam 23 Netherlands 813,562 31 May 2014 813562
Antwerp 59 Belgium 510,610 1 January 2014 510610
Athens 34 Greece 664,046 24 May 2011 664046
In [65]:
df_pop2.head()
Out[65]:
Rank State Population Date of census/estimate NumericPopulation
City
Aarhus 92 Denmark 326,676 1 October 2014 326676
Alicante 86 Spain 334,678 1 January 2012 334678
Amsterdam 23 Netherlands 813,562 31 May 2014 813562
Antwerp 59 Belgium 510,610 1 January 2014 510610
Athens 34 Greece 664,046 24 May 2011 664046
In [66]:
df_pop3 = df_pop.set_index(["State", "City"]).sortlevel(0)
In [67]:
df_pop3.head(7)
Out[67]:
Rank Population Date of census/estimate NumericPopulation
State City
Austria Vienna 7 1,794,770 1 January 2015 1794770
Belgium Antwerp 59 510,610 1 January 2014 510610
Brussels[17] 16 1,175,831 1 January 2014 1175831
Bulgaria Plovdiv 84 341,041 31 December 2013 341041
Sofia 14 1,291,895 14 December 2014 1291895
Varna 85 335,819 31 December 2013 335819
Croatia Zagreb 24 790,017 31 March 2011 790017
In [68]:
df_pop3.ix["Sweden"]
Out[68]:
Rank Population Date of census/estimate NumericPopulation
City
Gothenburg 53 528,014 31 March 2013 528014
Malmö 102 309,105 31 March 2013 309105
Stockholm 20 909,976 31 January 2014 909976
In [69]:
df_pop3.ix[("Sweden", "Gothenburg")]
Out[69]:
Rank                                  53
Population                       528,014
Date of census/estimate    31 March 2013
NumericPopulation                 528014
Name: (Sweden, Gothenburg), dtype: object
In [70]:
df_pop.set_index("City").sort(["State", "NumericPopulation"], ascending=[False, True]).head()
Out[70]:
Rank State Population Date of census/estimate NumericPopulation
City
Nottingham 103 United Kingdom 308,735 30 June 2012 308735
Wirral 97 United Kingdom 320,229 30 June 2012 320229
Coventry 94 United Kingdom 323,132 30 June 2012 323132
Wakefield 91 United Kingdom 327,627 30 June 2012 327627
Leicester 87 United Kingdom 331,606 30 June 2012 331606
In [71]:
city_counts = df_pop.State.value_counts()
In [72]:
city_counts.name = "# cities in top 105"
In [73]:
df_pop3 = df_pop[["State", "City", "NumericPopulation"]].set_index(["State", "City"])
In [74]:
df_pop4 = df_pop3.sum(level="State").sort("NumericPopulation", ascending=False)
In [75]:
df_pop4.head()
Out[75]:
NumericPopulation
State
United Kingdom 16011877
Germany 15119548
Spain 10041639
Italy 8764067
Poland 6267409
In [76]:
df_pop5 = (df_pop.drop("Rank", axis=1)
                 .groupby("State").sum()
                 .sort("NumericPopulation", ascending=False))
In [77]:
df_pop5.head()
Out[77]:
NumericPopulation
State
United Kingdom 16011877
Germany 15119548
Spain 10041639
Italy 8764067
Poland 6267409
In [78]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

city_counts.plot(kind='barh', ax=ax1)
ax1.set_xlabel("# cities in top 105")
df_pop5.NumericPopulation.plot(kind='barh', ax=ax2)
ax2.set_xlabel("Total pop. in top 105 cities")

fig.tight_layout()
fig.savefig("ch12-state-city-counts-sum.pdf")

Time series

Basics

In [79]:
import datetime
In [80]:
pd.date_range("2015-1-1", periods=31)
Out[80]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
               '2015-01-09', '2015-01-10', '2015-01-11', '2015-01-12',
               '2015-01-13', '2015-01-14', '2015-01-15', '2015-01-16',
               '2015-01-17', '2015-01-18', '2015-01-19', '2015-01-20',
               '2015-01-21', '2015-01-22', '2015-01-23', '2015-01-24',
               '2015-01-25', '2015-01-26', '2015-01-27', '2015-01-28',
               '2015-01-29', '2015-01-30', '2015-01-31'],
              dtype='datetime64[ns]', freq='D', tz=None)
In [81]:
pd.date_range(datetime.datetime(2015, 1, 1), periods=31)
Out[81]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
               '2015-01-09', '2015-01-10', '2015-01-11', '2015-01-12',
               '2015-01-13', '2015-01-14', '2015-01-15', '2015-01-16',
               '2015-01-17', '2015-01-18', '2015-01-19', '2015-01-20',
               '2015-01-21', '2015-01-22', '2015-01-23', '2015-01-24',
               '2015-01-25', '2015-01-26', '2015-01-27', '2015-01-28',
               '2015-01-29', '2015-01-30', '2015-01-31'],
              dtype='datetime64[ns]', freq='D', tz=None)
In [82]:
pd.date_range("2015-1-1 00:00", "2015-1-1 12:00", freq="H")
Out[82]:
DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00',
               '2015-01-01 10:00:00', '2015-01-01 11:00:00',
               '2015-01-01 12:00:00'],
              dtype='datetime64[ns]', freq='H', tz=None)
In [83]:
ts1 = pd.Series(np.arange(31), index=pd.date_range("2015-1-1", periods=31))
In [84]:
ts1.head()
Out[84]:
2015-01-01    0
2015-01-02    1
2015-01-03    2
2015-01-04    3
2015-01-05    4
Freq: D, dtype: int64
In [85]:
ts1["2015-1-3"]
Out[85]:
2
In [86]:
ts1.index[2]
Out[86]:
Timestamp('2015-01-03 00:00:00', offset='D')
In [87]:
ts1.index[2].year, ts1.index[2].month, ts1.index[2].day
Out[87]:
(2015, 1, 3)
In [88]:
ts1.index[2].nanosecond
Out[88]:
0
In [89]:
ts1.index[2].to_pydatetime()
Out[89]:
datetime.datetime(2015, 1, 3, 0, 0)
In [90]:
ts2 = pd.Series(np.random.rand(2), 
                index=[datetime.datetime(2015, 1, 1), datetime.datetime(2015, 2, 1)])
In [91]:
ts2
Out[91]:
2015-01-01    0.323774
2015-02-01    0.636933
dtype: float64
In [92]:
periods = pd.PeriodIndex([pd.Period('2015-01'), pd.Period('2015-02'), pd.Period('2015-03')])
In [93]:
ts3 = pd.Series(np.random.rand(3), periods)
In [94]:
ts3
Out[94]:
2015-01    0.023390
2015-02    0.998680
2015-03    0.451616
Freq: M, dtype: float64
In [95]:
ts3.index
Out[95]:
PeriodIndex(['2015-01', '2015-02', '2015-03'], dtype='int64', freq='M')
In [96]:
ts2.to_period('M')
Out[96]:
2015-01    0.323774
2015-02    0.636933
Freq: M, dtype: float64
In [97]:
pd.date_range("2015-1-1", periods=12, freq="M").to_period()
Out[97]:
PeriodIndex(['2015-01', '2015-02', '2015-03', '2015-04', '2015-05', '2015-06',
             '2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12'],
            dtype='int64', freq='M')

Temperature time series example

In [98]:
!head -n 5 temperature_outdoor_2014.tsv
1388530986	4.380000
1388531586	4.250000
1388532187	4.190000
1388532787	4.060000
1388533388	4.060000
In [99]:
df1 = pd.read_csv('temperature_outdoor_2014.tsv', delimiter="\t", names=["time", "outdoor"])
In [100]:
df2 = pd.read_csv('temperature_indoor_2014.tsv', delimiter="\t", names=["time", "indoor"])
In [101]:
df1.head()
Out[101]:
time outdoor
0 1388530986 4.38
1 1388531586 4.25
2 1388532187 4.19
3 1388532787 4.06
4 1388533388 4.06
In [102]:
df2.head()
Out[102]:
time indoor
0 1388530986 21.94
1 1388531586 22.00
2 1388532187 22.00
3 1388532787 22.00
4 1388533388 22.00
In [103]:
df1.time = (pd.to_datetime(df1.time.values, unit="s")
              .tz_localize('UTC').tz_convert('Europe/Stockholm'))
In [104]:
df1 = df1.set_index("time")
In [105]:
df2.time = (pd.to_datetime(df2.time.values, unit="s")
              .tz_localize('UTC').tz_convert('Europe/Stockholm'))
In [106]:
df2 = df2.set_index("time")
In [107]:
df1.head()
Out[107]:
outdoor
time
2014-01-01 00:03:06+01:00 4.38
2014-01-01 00:13:06+01:00 4.25
2014-01-01 00:23:07+01:00 4.19
2014-01-01 00:33:07+01:00 4.06
2014-01-01 00:43:08+01:00 4.06
In [108]:
df1.index[0]
Out[108]:
Timestamp('2014-01-01 00:03:06+0100', tz='Europe/Stockholm')
In [109]:
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
df1.plot(ax=ax)
df2.plot(ax=ax)

fig.tight_layout()
fig.savefig("ch12-timeseries-temperature-2014.pdf")
In [110]:
# select january data
In [111]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49548 entries, 2014-01-01 00:03:06+01:00 to 2014-12-30 23:56:35+01:00
Data columns (total 1 columns):
outdoor    49548 non-null float64
dtypes: float64(1)
memory usage: 774.2 KB
In [112]:
df1_jan = df1[(df1.index > "2014-1-1") & (df1.index < "2014-2-1")]
In [113]:
df1.index < "2014-2-1"
Out[113]:
array([ True,  True,  True, ..., False, False, False], dtype=bool)
In [114]:
df1_jan.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4452 entries, 2014-01-01 00:03:06+01:00 to 2014-01-31 23:56:58+01:00
Data columns (total 1 columns):
outdoor    4452 non-null float64
dtypes: float64(1)
memory usage: 69.6 KB
In [115]:
df2_jan = df2["2014-1-1":"2014-1-31"]
In [116]:
fig, ax = plt.subplots(1, 1, figsize=(12, 4))

df1_jan.plot(ax=ax)
df2_jan.plot(ax=ax)

fig.tight_layout()
fig.savefig("ch12-timeseries-selected-month.pdf")
In [117]:
# group by month
In [118]:
df1_month = df1.reset_index()
In [119]:
df1_month["month"] = df1_month.time.apply(lambda x: x.month)
In [120]:
df1_month.head()
Out[120]:
time outdoor month
0 2014-01-01 00:03:06+01:00 4.38 1
1 2014-01-01 00:13:06+01:00 4.25 1
2 2014-01-01 00:23:07+01:00 4.19 1
3 2014-01-01 00:33:07+01:00 4.06 1
4 2014-01-01 00:43:08+01:00 4.06 1
In [121]:
df1_month = df1_month.groupby("month").aggregate(np.mean)
In [122]:
df2_month = df2.reset_index()
In [123]:
df2_month["month"] = df2_month.time.apply(lambda x: x.month)
In [124]:
df2_month = df2_month.groupby("month").aggregate(np.mean)
In [125]:
df_month = df1_month.join(df2_month)
In [126]:
df_month.head(3)
Out[126]:
outdoor indoor
month
1 -1.776646 19.862590
2 2.231613 20.231507
3 4.615437 19.597748
In [127]:
df_month = pd.concat([df.to_period("M").groupby(level=0).mean() for df in [df1, df2]], axis=1)
In [128]:
df_month.head(3)
Out[128]:
outdoor indoor
time
2014-01 -1.776646 19.862590
2014-02 2.231613 20.231507
2014-03 4.615437 19.597748
In [129]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

df_month.plot(kind='bar', ax=axes[0])
df_month.plot(kind='box', ax=axes[1])

fig.tight_layout()
fig.savefig("ch12-grouped-by-month.pdf")
In [130]:
df_month
Out[130]:
outdoor indoor
time
2014-01 -1.776646 19.862590
2014-02 2.231613 20.231507
2014-03 4.615437 19.597748
2014-04 8.105193 22.149754
2014-05 12.261396 26.332160
2014-06 15.586955 28.687491
2014-07 20.780314 30.605333
2014-08 16.494823 28.099068
2014-09 12.823905 26.950366
2014-10 9.352000 23.379460
2014-11 4.992142 20.610365
2014-12 -0.058940 16.465674
In [131]:
# resampling
In [132]:
df1_hour = df1.resample("H")
In [133]:
df1_hour.columns = ["outdoor (hourly avg.)"]
In [134]:
df1_day = df1.resample("D")
In [135]:
df1_day.columns = ["outdoor (daily avg.)"]
In [136]:
df1_week = df1.resample("7D")
In [137]:
df1_week.columns = ["outdoor (weekly avg.)"]
In [138]:
df1_month = df1.resample("M")
In [139]:
df1_month.columns = ["outdoor (monthly avg.)"]
In [140]:
df_diff = (df1.resample("D").outdoor - df2.resample("D").indoor)
In [141]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6))

df1_hour.plot(ax=ax1, alpha=0.25)
df1_day.plot(ax=ax1)
df1_week.plot(ax=ax1)
df1_month.plot(ax=ax1)

df_diff.plot(ax=ax2)
ax2.set_title("temperature difference between outdoor and indoor")

fig.tight_layout()
fig.savefig("ch12-timeseries-resampled.pdf")
In [142]:
fill_methods = [None, 'ffill', 'bfill']
In [143]:
pd.concat([df1.resample("5min", fill_method=fm).rename(columns={"outdoor": fm})
           for fm in fill_methods], axis=1).head()
Out[143]:
None ffill bfill
time
2014-01-01 00:00:00+01:00 4.38 4.38 4.38
2014-01-01 00:05:00+01:00 NaN 4.38 4.25
2014-01-01 00:10:00+01:00 4.25 4.25 4.25
2014-01-01 00:15:00+01:00 NaN 4.25 4.19
2014-01-01 00:20:00+01:00 4.19 4.19 4.19

Selected day

In [144]:
df1_dec25 = df1[(df1.index < "2014-9-1") & (df1.index >= "2014-8-1")].resample("D")
In [145]:
df1_dec25 = df1.ix["2014-12-25"]
In [146]:
df1_dec25.head(5)
Out[146]:
outdoor
time
2014-12-25 00:01:45+01:00 -4.88
2014-12-25 00:11:45+01:00 -4.94
2014-12-25 00:21:46+01:00 -5.06
2014-12-25 00:31:47+01:00 -5.06
2014-12-25 00:41:47+01:00 -5.12
In [147]:
df2_dec25 = df2.ix["2014-12-25"]
In [148]:
df2_dec25.head(5)
Out[148]:
indoor
time
2014-12-25 00:01:45+01:00 16.31
2014-12-25 00:11:45+01:00 16.25
2014-12-25 00:21:46+01:00 16.31
2014-12-25 00:31:47+01:00 16.31
2014-12-25 00:41:47+01:00 16.25
In [149]:
df1_dec25.describe().T
Out[149]:
count mean std min 25% 50% 75% max
outdoor 144 -7.788819 1.456085 -10.06 -9.075 -7.75 -6.8625 -4.88
In [150]:
fig, ax = plt.subplots(1, 1, figsize=(12, 4))

df1_dec25.plot(ax=ax)

fig.savefig("ch12-timeseries-selected-month.pdf")
In [151]:
df1.index
Out[151]:
DatetimeIndex(['2014-01-01 00:03:06+01:00', '2014-01-01 00:13:06+01:00',
               '2014-01-01 00:23:07+01:00', '2014-01-01 00:33:07+01:00',
               '2014-01-01 00:43:08+01:00', '2014-01-01 00:53:08+01:00',
               '2014-01-01 01:03:09+01:00', '2014-01-01 01:13:09+01:00',
               '2014-01-01 01:23:10+01:00', '2014-01-01 01:33:26+01:00', 
               ...
               '2014-12-30 22:26:30+01:00', '2014-12-30 22:36:31+01:00',
               '2014-12-30 22:46:31+01:00', '2014-12-30 22:56:32+01:00',
               '2014-12-30 23:06:32+01:00', '2014-12-30 23:16:33+01:00',
               '2014-12-30 23:26:33+01:00', '2014-12-30 23:36:34+01:00',
               '2014-12-30 23:46:35+01:00', '2014-12-30 23:56:35+01:00'],
              dtype='datetime64[ns]', name=u'time', length=49548, freq=None, tz='Europe/Stockholm')

Seaborn statistical visualization library

In [152]:
import seaborn as sns
In [153]:
sns.set(style="darkgrid")
In [154]:
#sns.set(style="whitegrid")
In [155]:
df1 = pd.read_csv('temperature_outdoor_2014.tsv', delimiter="\t", names=["time", "outdoor"])
df1.time = pd.to_datetime(df1.time.values, unit="s").tz_localize('UTC').tz_convert('Europe/Stockholm')
df1 = df1.set_index("time").resample("10min")
df2 = pd.read_csv('temperature_indoor_2014.tsv', delimiter="\t", names=["time", "indoor"])
df2.time = pd.to_datetime(df2.time.values, unit="s").tz_localize('UTC').tz_convert('Europe/Stockholm')
df2 = df2.set_index("time").resample("10min")
df_temp = pd.concat([df1, df2], axis=1)
In [156]:
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
df_temp.resample("D").plot(y=["outdoor", "indoor"], ax=ax)
fig.tight_layout()
fig.savefig("ch12-seaborn-plot.pdf")
In [157]:
#sns.kdeplot(df_temp["outdoor"].dropna().values, shade=True, cumulative=True);
In [158]:
sns.distplot(df_temp.to_period("M")["outdoor"]["2014-04"].dropna().values, bins=50);
sns.distplot(df_temp.to_period("M")["indoor"]["2014-04"].dropna().values, bins=50);

plt.savefig("ch12-seaborn-distplot.pdf")
In [159]:
with sns.axes_style("white"):
    sns.jointplot(df_temp.resample("H")["outdoor"].values,
                  df_temp.resample("H")["indoor"].values, kind="hex");
    
plt.savefig("ch12-seaborn-jointplot.pdf")
In [160]:
sns.kdeplot(df_temp.resample("H")["outdoor"].dropna().values,
            df_temp.resample("H")["indoor"].dropna().values, shade=False);

plt.savefig("ch12-seaborn-kdeplot.pdf")
In [163]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))

sns.boxplot(df_temp.dropna(), ax=ax1, palette="pastel")
sns.violinplot(df_temp.dropna(), ax=ax2, palette="pastel")

fig.tight_layout()
fig.savefig("ch12-seaborn-boxplot-violinplot.pdf")
In [164]:
sns.violinplot(x=df_temp.dropna().index.month, y=df_temp.dropna().outdoor, color="skyblue");

plt.savefig("ch12-seaborn-violinplot.pdf")
In [165]:
df_temp["month"] = df_temp.index.month
df_temp["hour"] = df_temp.index.hour
In [166]:
df_temp.head()
Out[166]:
outdoor indoor month hour
time
2014-01-01 00:00:00+01:00 4.38 21.94 1 0
2014-01-01 00:10:00+01:00 4.25 22.00 1 0
2014-01-01 00:20:00+01:00 4.19 22.00 1 0
2014-01-01 00:30:00+01:00 4.06 22.00 1 0
2014-01-01 00:40:00+01:00 4.06 22.00 1 0
In [167]:
table = pd.pivot_table(df_temp, values='outdoor', index=['month'], columns=['hour'], aggfunc=np.mean)
In [168]:
table
Out[168]:
hour 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
month
1 -1.692312 -1.750162 -1.826649 -1.879086 -1.922527 -1.968065 -2.020914 -2.035806 -2.101774 -2.001022 ... -1.457849 -1.696935 -1.814194 -1.812258 -1.853297 -1.898432 -1.839730 -1.806486 -1.854462 -1.890811
2 1.613690 1.521190 1.479405 1.464371 1.506407 1.485595 1.499167 1.516946 1.669226 2.067725 ... 3.573593 3.360741 2.939390 2.501607 2.357425 2.236190 2.204458 2.137619 2.024671 1.896190
3 3.192366 2.866774 2.628000 2.524140 2.384140 2.235538 2.243387 2.622258 3.419301 4.466290 ... 7.790323 7.930914 7.595892 6.770914 5.731508 4.983784 4.437419 4.022312 3.657903 3.407258
4 5.832738 5.336012 4.926667 4.597059 4.380000 4.109769 4.123699 4.741437 5.878035 7.272299 ... 12.175556 12.500059 12.494483 12.361156 11.989240 10.454881 8.857619 7.712619 6.974762 6.293512
5 9.792204 9.369351 9.009839 8.670914 8.463387 8.446919 8.772324 9.562742 10.723622 12.047717 ... 15.542581 15.744624 15.784839 15.799570 17.009892 15.685161 13.632796 12.216290 11.291237 10.622849
6 13.209556 12.792889 12.382889 11.967889 11.735778 11.886667 12.503778 13.338167 14.343444 15.665475 ... 18.630556 18.866292 18.680611 18.529832 20.057877 18.853389 16.969777 15.675111 14.658778 13.898167
7 17.956344 17.348641 16.793152 16.309892 16.001559 15.986774 16.506613 17.478226 18.850054 20.533763 ... 24.598441 25.030000 24.869194 24.764409 26.155161 24.896505 22.550269 20.882649 19.699022 18.822634
8 14.498205 13.960128 13.555128 12.995641 12.651410 12.485974 12.680130 13.403506 14.578780 16.170833 ... 20.473810 20.292381 20.328795 19.642436 19.373846 18.713462 17.034872 15.843590 15.146154 14.596667
9 11.133000 10.725667 10.362444 9.976833 9.729333 9.503944 9.357500 9.689778 10.600778 11.829106 ... 16.336983 16.828268 17.031056 16.786983 15.853556 14.534637 13.350444 12.545278 11.954190 11.399056
10 8.602011 8.490598 8.382486 8.257097 8.166774 8.140054 8.140161 8.148333 8.410914 9.054946 ... 11.330323 11.189194 10.836865 10.361568 9.781022 9.373441 9.134570 8.956505 8.820270 8.623297
11 4.847111 4.765922 4.815642 4.773240 4.809611 4.785833 4.741222 4.739778 4.794500 4.965389 ... 5.526034 5.342753 5.081250 5.056629 4.959106 4.868111 4.833333 4.774389 4.720722 4.699722
12 -0.366369 -0.390556 -0.447374 -0.370111 -0.353128 -0.319832 -0.358667 -0.410278 -0.483167 -0.344667 ... 0.738944 0.367056 0.152167 -0.106111 -0.182500 -0.244167 -0.290000 -0.305333 -0.302778 -0.325642

12 rows × 24 columns

In [169]:
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sns.heatmap(table, ax=ax);

fig.tight_layout()
fig.savefig("ch12-seaborn-heatmap.pdf")

Versions

In [170]:
%reload_ext version_information
In [171]:
%version_information numpy, matplotlib, pandas, seaborn
Out[171]:
SoftwareVersion
Python2.7.10 64bit [GCC 4.2.1 (Apple Inc. build 5577)]
IPython3.2.1
OSDarwin 14.1.0 x86_64 i386 64bit
numpy1.9.2
matplotlib1.4.3
pandas0.16.2
seaborn0.6.0