Predicting air quality with Prophet Feature description here
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
DATAPATH = 'data/AirQualityUCI.csv'
data = pd.read_csv(DATAPATH, sep=';')
data.head()
Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | Unnamed: 15 | Unnamed: 16 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10/03/2004 | 18.00.00 | 2,6 | 1360.0 | 150.0 | 11,9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13,6 | 48,9 | 0,7578 | NaN | NaN |
1 | 10/03/2004 | 19.00.00 | 2 | 1292.0 | 112.0 | 9,4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13,3 | 47,7 | 0,7255 | NaN | NaN |
2 | 10/03/2004 | 20.00.00 | 2,2 | 1402.0 | 88.0 | 9,0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11,9 | 54,0 | 0,7502 | NaN | NaN |
3 | 10/03/2004 | 21.00.00 | 2,2 | 1376.0 | 80.0 | 9,2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11,0 | 60,0 | 0,7867 | NaN | NaN |
4 | 10/03/2004 | 22.00.00 | 1,6 | 1272.0 | 51.0 | 6,5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11,2 | 59,6 | 0,7888 | NaN | NaN |
data.shape
(9471, 17)
data.dropna(axis=1, how='all', inplace=True)
data.dropna(axis=0, how='all', inplace=True)
data.shape
(9357, 15)
data.head()
Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10/03/2004 | 18.00.00 | 2,6 | 1360.0 | 150.0 | 11,9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13,6 | 48,9 | 0,7578 |
1 | 10/03/2004 | 19.00.00 | 2 | 1292.0 | 112.0 | 9,4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13,3 | 47,7 | 0,7255 |
2 | 10/03/2004 | 20.00.00 | 2,2 | 1402.0 | 88.0 | 9,0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11,9 | 54,0 | 0,7502 |
3 | 10/03/2004 | 21.00.00 | 2,2 | 1376.0 | 80.0 | 9,2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11,0 | 60,0 | 0,7867 |
4 | 10/03/2004 | 22.00.00 | 1,6 | 1272.0 | 51.0 | 6,5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11,2 | 59,6 | 0,7888 |
data['Date'] = pd.to_datetime(data['Date'])
for col in data.iloc[:,2:].columns:
if data[col].dtypes == object:
data[col] = data[col].str.replace(',', '.').astype('float')
def positive_average(num):
return num[num > -200].mean()
daily_data = data.drop('Time', axis=1).groupby('Date').apply(positive_average)
daily_data.head()
CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | |||||||||||||
2004-01-04 | 2.550000 | 1244.380952 | 310.125 | 11.971429 | 1010.428571 | 161.304348 | 880.666667 | 96.739130 | 1644.380952 | 1155.857143 | 15.519048 | 51.133333 | 0.854881 |
2004-01-05 | 2.056522 | 1097.500000 | 275.000 | 8.662500 | 913.708333 | 104.739130 | 918.083333 | 84.956522 | 1640.416667 | 904.625000 | 20.150000 | 52.183333 | 1.167312 |
2004-01-06 | 2.100000 | 1135.583333 | NaN | 12.375000 | 1021.875000 | 152.043478 | 896.791667 | 75.869565 | 1881.500000 | 1066.958333 | 20.325000 | 66.154167 | 1.533350 |
2004-01-07 | 2.162500 | 1130.583333 | NaN | 12.225000 | 1038.541667 | 139.695652 | 740.916667 | 113.434783 | 1854.250000 | 1059.625000 | 30.450000 | 39.691667 | 1.624108 |
2004-01-08 | 0.983333 | 974.166667 | NaN | 5.808333 | 792.583333 | 51.739130 | 880.083333 | 58.521739 | 1559.000000 | 670.583333 | 30.654167 | 42.120833 | 1.673521 |
daily_data.isna().sum() > 8
CO(GT) True PT08.S1(CO) False NMHC(GT) True C6H6(GT) False PT08.S2(NMHC) False NOx(GT) True PT08.S3(NOx) False NO2(GT) True PT08.S4(NO2) False PT08.S5(O3) False T False RH False AH False dtype: bool
daily_data = daily_data.iloc[:,(daily_data.isna().sum() <= 8).values]
daily_data.head()
PT08.S1(CO) | C6H6(GT) | PT08.S2(NMHC) | PT08.S3(NOx) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
---|---|---|---|---|---|---|---|---|---|
Date | |||||||||
2004-01-04 | 1244.380952 | 11.971429 | 1010.428571 | 880.666667 | 1644.380952 | 1155.857143 | 15.519048 | 51.133333 | 0.854881 |
2004-01-05 | 1097.500000 | 8.662500 | 913.708333 | 918.083333 | 1640.416667 | 904.625000 | 20.150000 | 52.183333 | 1.167312 |
2004-01-06 | 1135.583333 | 12.375000 | 1021.875000 | 896.791667 | 1881.500000 | 1066.958333 | 20.325000 | 66.154167 | 1.533350 |
2004-01-07 | 1130.583333 | 12.225000 | 1038.541667 | 740.916667 | 1854.250000 | 1059.625000 | 30.450000 | 39.691667 | 1.624108 |
2004-01-08 | 974.166667 | 5.808333 | 792.583333 | 880.083333 | 1559.000000 | 670.583333 | 30.654167 | 42.120833 | 1.673521 |
daily_data.shape
(391, 9)
daily_data = daily_data.dropna()
daily_data.shape
(383, 9)
weekly_data = daily_data.resample('W').mean()
weekly_data.head()
PT08.S1(CO) | C6H6(GT) | PT08.S2(NMHC) | PT08.S3(NOx) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
---|---|---|---|---|---|---|---|---|---|
Date | |||||||||
2004-01-04 | 1244.380952 | 11.971429 | 1010.428571 | 880.666667 | 1644.380952 | 1155.857143 | 15.519048 | 51.133333 | 0.854881 |
2004-01-11 | 1136.801760 | 11.674457 | 1009.344462 | 760.484990 | 1727.833075 | 1083.683747 | 24.564726 | 53.224017 | 1.526858 |
2004-01-18 | 1173.375000 | 13.429167 | 1050.458333 | 1490.333333 | 1448.541667 | 1196.333333 | 10.891667 | 77.000000 | 1.002796 |
2004-01-25 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2004-02-01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
weekly_data = weekly_data.dropna()
weekly_data.shape
(73, 9)
def plot_data(col):
plt.figure(figsize=(17, 8))
plt.plot(weekly_data[col])
plt.xlabel('Time')
plt.ylabel(col)
plt.grid(False)
plt.show()
for col in weekly_data.columns:
plot_data(col)
Let's focus on predictig the concentration of NOx. Oxides of nitrogen react to form smog and acid rain, as well as being central to the formation of fine particles and ground level ozone, both of which are associated with adverse health effects.
cols_to_drop = ['PT08.S1(CO)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']
weekly_data = weekly_data.drop(cols_to_drop, axis=1)
weekly_data.head()
PT08.S3(NOx) | |
---|---|
Date | |
2004-01-04 | 880.666667 |
2004-01-11 | 760.484990 |
2004-01-18 | 1490.333333 |
2004-02-08 | 869.108333 |
2004-02-15 | 706.395833 |
from fbprophet import Prophet
import logging
logging.getLogger().setLevel(logging.ERROR)
df = weekly_data.reset_index()
df.columns = ['ds', 'y']
df.head()
ds | y | |
---|---|---|
0 | 2004-01-04 | 880.666667 |
1 | 2004-01-11 | 760.484990 |
2 | 2004-01-18 | 1490.333333 |
3 | 2004-02-08 | 869.108333 |
4 | 2004-02-15 | 706.395833 |
prediction_size = 30
train_df = df[:-prediction_size]
m = Prophet()
m.fit(train_df)
<fbprophet.forecaster.Prophet at 0x121ec1390>
future = m.make_future_dataframe(periods=prediction_size)
forecast = m.predict(future)
m.plot(forecast)
m.plot_components(forecast)
def make_comparison_dataframe(historical, forecast):
return forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(historical.set_index('ds'))
cmp_df = make_comparison_dataframe(df, forecast)
cmp_df.head()
yhat | yhat_lower | yhat_upper | y | |
---|---|---|---|---|
ds | ||||
2004-01-04 | 946.024230 | 796.145712 | 1111.430767 | 880.666667 |
2004-01-11 | 943.202988 | 791.485768 | 1106.121910 | 760.484990 |
2004-01-18 | 940.381746 | 768.280827 | 1092.981803 | 1490.333333 |
2004-02-08 | 931.918021 | 774.699096 | 1085.124113 | 869.108333 |
2004-02-15 | 929.096780 | 767.145584 | 1091.778626 | 706.395833 |
def calculate_forecast_errors(df, prediction_size):
df = df.copy()
df['e'] = df['y'] - df['yhat']
df['p'] = 100 * df['e'] / df['y']
predicted_part = df[-prediction_size:]
error_mean = lambda error_name: np.mean(np.abs(predicted_part[error_name]))
return {'MAPE': error_mean('p'), 'MAE': error_mean('e')}
for err_name, err_value in calculate_forecast_errors(cmp_df, prediction_size).items():
print(err_name, err_value)
MAPE 13.858454067188074 MAE 109.32387955182074
plt.figure(figsize=(17, 8))
plt.plot(cmp_df['yhat'])
plt.plot(cmp_df['yhat_lower'])
plt.plot(cmp_df['yhat_upper'])
plt.plot(cmp_df['y'])
plt.xlabel('Time')
plt.ylabel('Average Weekly NOx Concentration')
plt.grid(False)
plt.show()