--by Lu Tang
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the unemployment-rate data from a CSV file in the working directory
unrate = pd.read_csv('unrate.csv')
# Preview the first five rows (DATE and VALUE columns)
unrate.head()
DATE | VALUE | |
---|---|---|
0 | 1948-01-01 | 3.4 |
1 | 1948-02-01 | 3.8 |
2 | 1948-03-01 | 4.0 |
3 | 1948-04-01 | 3.9 |
4 | 1948-05-01 | 3.5 |
# View the last 5 rows to see where the series ends
unrate.tail()
DATE | VALUE | |
---|---|---|
819 | 2016-04-01 | 5.0 |
820 | 2016-05-01 | 4.7 |
821 | 2016-06-01 | 4.9 |
822 | 2016-07-01 | 4.9 |
823 | 2016-08-01 | 4.9 |
# Check column dtypes and non-null counts
unrate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 824 entries, 0 to 823 Data columns (total 2 columns): DATE 824 non-null object VALUE 824 non-null float64 dtypes: float64(1), object(1) memory usage: 13.0+ KB
# Convert the DATE column from object (string) dtype to datetime
unrate['DATE']=pd.to_datetime(unrate['DATE'])
# Re-check the dtypes to confirm the conversion
unrate.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 824 entries, 0 to 823 Data columns (total 2 columns): DATE 824 non-null datetime64[ns] VALUE 824 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 13.0 KB
# Derive calendar MONTH and YEAR columns from DATE using the pandas datetime (.dt) accessor
date_parts = unrate['DATE'].dt
unrate['MONTH'] = date_parts.month
unrate['YEAR'] = date_parts.year
# Show the last 10 rows to confirm the new columns
unrate.tail(10)
DATE | VALUE | MONTH | YEAR | |
---|---|---|---|---|
814 | 2015-11-01 | 5.0 | 11 | 2015 |
815 | 2015-12-01 | 5.0 | 12 | 2015 |
816 | 2016-01-01 | 4.9 | 1 | 2016 |
817 | 2016-02-01 | 4.9 | 2 | 2016 |
818 | 2016-03-01 | 5.0 | 3 | 2016 |
819 | 2016-04-01 | 5.0 | 4 | 2016 |
820 | 2016-05-01 | 4.7 | 5 | 2016 |
821 | 2016-06-01 | 4.9 | 6 | 2016 |
822 | 2016-07-01 | 4.9 | 7 | 2016 |
823 | 2016-08-01 | 4.9 | 8 | 2016 |
# Plot the monthly unemployment rate for each year from 2012 to 2016, one line
# per year with the month on the x-axis.
#
# Rows are selected by the YEAR column rather than by positional slices, so
# each line contains the data its legend label names. (Positional slices
# 0-47 are the 1948-1951 rows, not 2012-2015.)
fig = plt.figure(figsize=(10, 5))
year_colors = {
    2012: 'red',
    2013: 'blue',
    2014: 'green',
    2015: 'orange',
    2016: 'purple',  # 2016 only has data through August, so its line is shorter
}
for year, color in year_colors.items():
    subset = unrate[unrate['YEAR'] == year]
    plt.plot(subset['MONTH'], subset['VALUE'], c=color, label=str(year))
plt.xlabel('Month')
plt.ylabel('Unemployment Rate, Percent')
plt.title('Monthly Unemployment Trends, 2012-2016')
plt.legend(loc='upper left')
plt.show()
# save the chart
fig.savefig('Monthly Unemployment Trends, 2012-2016.png')
# Average unemployment rate for each calendar year.
# VALUE is selected before aggregating so the mean is computed only for the
# column that is needed, instead of for every column (DATE, MONTH) and then
# discarding the rest.
# NOTE: the 2016 data ends in August, so the 2016 figure is not a full-year average.
unrate_year = unrate.groupby('YEAR')['VALUE'].mean()
unrate_year.head()
YEAR 1948 3.750000 1949 6.050000 1950 5.208333 1951 3.283333 1952 3.025000 Name: VALUE, dtype: float64
# Draw the yearly averages as a line chart with pandas' built-in plotting,
# then label the y-axis on the Axes that the plot call returns
yearly_ax = unrate_year.plot(
    figsize=(10, 5),
    title='Annually Average Unemployment Rate, 1948-2016',
)
yearly_ax.set_ylabel('Unemployment Rate, Percent')
Text(0, 0.5, 'Unemployment Rate, Percent')
# Use seaborn to apply a different global plot style to the charts that follow
import seaborn as sns
sns.set(style='darkgrid', context='talk', palette='Dark2')
# Reload the raw CSV into a separate frame (df) for the moving-average analysis
df = pd.read_csv('unrate.csv')
df.head()
DATE | VALUE | |
---|---|---|
0 | 1948-01-01 | 3.4 |
1 | 1948-02-01 | 3.8 |
2 | 1948-03-01 | 4.0 |
3 | 1948-04-01 | 3.9 |
4 | 1948-05-01 | 3.5 |
# Parse DATE as datetime and promote it to the index (this also removes it from the columns)
df['DATE'] = pd.to_datetime(df['DATE'])
df = df.set_index('DATE')
# Simple moving averages of VALUE: a short window (20 rows) and a long window (100 rows)
rate = df['VALUE']
df['short_rolling'] = rate.rolling(window=20).mean()
df['long_rolling'] = rate.rolling(window=100).mean()
# The rolling columns are NaN until their window is full, so view the first 20 rows
df.head(20)
VALUE | short_rolling | long_rolling | |
---|---|---|---|
DATE | |||
1948-01-01 | 3.4 | NaN | NaN |
1948-02-01 | 3.8 | NaN | NaN |
1948-03-01 | 4.0 | NaN | NaN |
1948-04-01 | 3.9 | NaN | NaN |
1948-05-01 | 3.5 | NaN | NaN |
1948-06-01 | 3.6 | NaN | NaN |
1948-07-01 | 3.6 | NaN | NaN |
1948-08-01 | 3.9 | NaN | NaN |
1948-09-01 | 3.8 | NaN | NaN |
1948-10-01 | 3.7 | NaN | NaN |
1948-11-01 | 3.8 | NaN | NaN |
1948-12-01 | 4.0 | NaN | NaN |
1949-01-01 | 4.3 | NaN | NaN |
1949-02-01 | 4.7 | NaN | NaN |
1949-03-01 | 5.0 | NaN | NaN |
1949-04-01 | 5.3 | NaN | NaN |
1949-05-01 | 6.1 | NaN | NaN |
1949-06-01 | 6.2 | NaN | NaN |
1949-07-01 | 6.7 | NaN | NaN |
1949-08-01 | 6.8 | 4.505 | NaN |
# View the last 5 rows, where both rolling averages are populated
df.tail()
VALUE | short_rolling | long_rolling | |
---|---|---|---|
DATE | |||
2016-04-01 | 5.0 | 5.315 | 7.461 |
2016-05-01 | 4.7 | 5.250 | 7.458 |
2016-06-01 | 4.9 | 5.210 | 7.458 |
2016-07-01 | 4.9 | 5.165 | 7.456 |
2016-08-01 | 4.9 | 5.130 | 7.455 |
# Overlay the original series and both moving averages on one chart for comparison;
# each line is labelled with its column name
fig, ax = plt.subplots(figsize=(10, 5))
for column in ('VALUE', 'short_rolling', 'long_rolling'):
    ax.plot(df[column], label=column)
ax.set_title('USA Unemployment Trends, 1948-2016')
ax.legend(loc='best')
ax.set_ylabel('unemployment_rate')
Text(0, 0.5, 'unemployment_rate')
- As we can see, the short rolling line is very similar to the original data, while the long rolling line is smoother.
- We can also observe that the unemployment rate is higher in recent years, but in general, it shows high fluctuation.
# Moving average of VALUE with a 50-row window
mid_window = df['VALUE'].rolling(window=50)
df['mid_rolling'] = mid_window.mean()
# Plot the 50-row moving average on its own chart
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df['mid_rolling'])
ax.set_title('USA Unemployment Trends, 1948-2016')
ax.set_ylabel('unemployment_rate')
Text(0, 0.5, 'unemployment_rate')
# Save the most recently created figure (the mid_rolling chart) as a PNG file
fig.savefig('USA Unemployment Trend.png')