Air Quality

Predicting air quality with Prophet Feature description here

In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

%matplotlib inline

Import dataset

In [4]:
DATAPATH = 'data/AirQualityUCI.csv'

data = pd.read_csv(DATAPATH, sep=';')
data.head()
Out[4]:
Date Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH Unnamed: 15 Unnamed: 16
0 10/03/2004 18.00.00 2,6 1360.0 150.0 11,9 1046.0 166.0 1056.0 113.0 1692.0 1268.0 13,6 48,9 0,7578 NaN NaN
1 10/03/2004 19.00.00 2 1292.0 112.0 9,4 955.0 103.0 1174.0 92.0 1559.0 972.0 13,3 47,7 0,7255 NaN NaN
2 10/03/2004 20.00.00 2,2 1402.0 88.0 9,0 939.0 131.0 1140.0 114.0 1555.0 1074.0 11,9 54,0 0,7502 NaN NaN
3 10/03/2004 21.00.00 2,2 1376.0 80.0 9,2 948.0 172.0 1092.0 122.0 1584.0 1203.0 11,0 60,0 0,7867 NaN NaN
4 10/03/2004 22.00.00 1,6 1272.0 51.0 6,5 836.0 131.0 1205.0 116.0 1490.0 1110.0 11,2 59,6 0,7888 NaN NaN
In [5]:
data.shape
Out[5]:
(9471, 17)
In [6]:
data.dropna(axis=1, how='all', inplace=True)
data.dropna(axis=0, how='all', inplace=True)
In [7]:
data.shape
Out[7]:
(9357, 15)
In [8]:
data.head()
Out[8]:
Date Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH
0 10/03/2004 18.00.00 2,6 1360.0 150.0 11,9 1046.0 166.0 1056.0 113.0 1692.0 1268.0 13,6 48,9 0,7578
1 10/03/2004 19.00.00 2 1292.0 112.0 9,4 955.0 103.0 1174.0 92.0 1559.0 972.0 13,3 47,7 0,7255
2 10/03/2004 20.00.00 2,2 1402.0 88.0 9,0 939.0 131.0 1140.0 114.0 1555.0 1074.0 11,9 54,0 0,7502
3 10/03/2004 21.00.00 2,2 1376.0 80.0 9,2 948.0 172.0 1092.0 122.0 1584.0 1203.0 11,0 60,0 0,7867
4 10/03/2004 22.00.00 1,6 1272.0 51.0 6,5 836.0 131.0 1205.0 116.0 1490.0 1110.0 11,2 59,6 0,7888

Data cleaning and feature engineering

In [9]:
data['Date'] = pd.to_datetime(data['Date'])
In [10]:
for col in data.iloc[:,2:].columns:
    if data[col].dtypes == object:
        data[col] = data[col].str.replace(',', '.').astype('float')
In [11]:
def positive_average(num):
    return num[num > -200].mean()
    
daily_data = data.drop('Time', axis=1).groupby('Date').apply(positive_average)
In [12]:
daily_data.head()
Out[12]:
CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH
Date
2004-01-04 2.550000 1244.380952 310.125 11.971429 1010.428571 161.304348 880.666667 96.739130 1644.380952 1155.857143 15.519048 51.133333 0.854881
2004-01-05 2.056522 1097.500000 275.000 8.662500 913.708333 104.739130 918.083333 84.956522 1640.416667 904.625000 20.150000 52.183333 1.167312
2004-01-06 2.100000 1135.583333 NaN 12.375000 1021.875000 152.043478 896.791667 75.869565 1881.500000 1066.958333 20.325000 66.154167 1.533350
2004-01-07 2.162500 1130.583333 NaN 12.225000 1038.541667 139.695652 740.916667 113.434783 1854.250000 1059.625000 30.450000 39.691667 1.624108
2004-01-08 0.983333 974.166667 NaN 5.808333 792.583333 51.739130 880.083333 58.521739 1559.000000 670.583333 30.654167 42.120833 1.673521
In [13]:
daily_data.isna().sum() > 8
Out[13]:
CO(GT)            True
PT08.S1(CO)      False
NMHC(GT)          True
C6H6(GT)         False
PT08.S2(NMHC)    False
NOx(GT)           True
PT08.S3(NOx)     False
NO2(GT)           True
PT08.S4(NO2)     False
PT08.S5(O3)      False
T                False
RH               False
AH               False
dtype: bool
In [14]:
daily_data = daily_data.iloc[:,(daily_data.isna().sum() <= 8).values]
In [15]:
daily_data.head()
Out[15]:
PT08.S1(CO) C6H6(GT) PT08.S2(NMHC) PT08.S3(NOx) PT08.S4(NO2) PT08.S5(O3) T RH AH
Date
2004-01-04 1244.380952 11.971429 1010.428571 880.666667 1644.380952 1155.857143 15.519048 51.133333 0.854881
2004-01-05 1097.500000 8.662500 913.708333 918.083333 1640.416667 904.625000 20.150000 52.183333 1.167312
2004-01-06 1135.583333 12.375000 1021.875000 896.791667 1881.500000 1066.958333 20.325000 66.154167 1.533350
2004-01-07 1130.583333 12.225000 1038.541667 740.916667 1854.250000 1059.625000 30.450000 39.691667 1.624108
2004-01-08 974.166667 5.808333 792.583333 880.083333 1559.000000 670.583333 30.654167 42.120833 1.673521
In [16]:
daily_data.shape
Out[16]:
(391, 9)
In [17]:
daily_data = daily_data.dropna()
In [18]:
daily_data.shape
Out[18]:
(383, 9)
In [19]:
weekly_data = daily_data.resample('W').mean()
In [20]:
weekly_data.head()
Out[20]:
PT08.S1(CO) C6H6(GT) PT08.S2(NMHC) PT08.S3(NOx) PT08.S4(NO2) PT08.S5(O3) T RH AH
Date
2004-01-04 1244.380952 11.971429 1010.428571 880.666667 1644.380952 1155.857143 15.519048 51.133333 0.854881
2004-01-11 1136.801760 11.674457 1009.344462 760.484990 1727.833075 1083.683747 24.564726 53.224017 1.526858
2004-01-18 1173.375000 13.429167 1050.458333 1490.333333 1448.541667 1196.333333 10.891667 77.000000 1.002796
2004-01-25 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2004-02-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [21]:
weekly_data = weekly_data.dropna()
In [22]:
weekly_data.shape
Out[22]:
(73, 9)
In [23]:
def plot_data(col):
    plt.figure(figsize=(17, 8))
    plt.plot(weekly_data[col])
    plt.xlabel('Time')
    plt.ylabel(col)
    plt.grid(False)
    plt.show()
    
for col in weekly_data.columns:
    plot_data(col)

Modelling

Let's focus on predictig the concentration of NOx. Oxides of nitrogen react to form smog and acid rain, as well as being central to the formation of fine particles and ground level ozone, both of which are associated with adverse health effects.

In [24]:
cols_to_drop = ['PT08.S1(CO)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']

weekly_data = weekly_data.drop(cols_to_drop, axis=1)
In [25]:
weekly_data.head()
Out[25]:
PT08.S3(NOx)
Date
2004-01-04 880.666667
2004-01-11 760.484990
2004-01-18 1490.333333
2004-02-08 869.108333
2004-02-15 706.395833
In [26]:
from fbprophet import Prophet
import logging

logging.getLogger().setLevel(logging.ERROR)
In [27]:
df = weekly_data.reset_index()
df.columns = ['ds', 'y']
df.head()
Out[27]:
ds y
0 2004-01-04 880.666667
1 2004-01-11 760.484990
2 2004-01-18 1490.333333
3 2004-02-08 869.108333
4 2004-02-15 706.395833
In [28]:
prediction_size = 30
train_df = df[:-prediction_size]
In [29]:
m = Prophet()
m.fit(train_df)
Out[29]:
<fbprophet.forecaster.Prophet at 0x121ec1390>
In [30]:
future = m.make_future_dataframe(periods=prediction_size)

forecast = m.predict(future)
In [31]:
m.plot(forecast)
Out[31]:
In [32]:
m.plot_components(forecast)
Out[32]:
In [33]:
def make_comparison_dataframe(historical, forecast):
    return forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(historical.set_index('ds'))
In [34]:
cmp_df = make_comparison_dataframe(df, forecast)

cmp_df.head()
Out[34]:
yhat yhat_lower yhat_upper y
ds
2004-01-04 946.024230 796.145712 1111.430767 880.666667
2004-01-11 943.202988 791.485768 1106.121910 760.484990
2004-01-18 940.381746 768.280827 1092.981803 1490.333333
2004-02-08 931.918021 774.699096 1085.124113 869.108333
2004-02-15 929.096780 767.145584 1091.778626 706.395833
In [35]:
def calculate_forecast_errors(df, prediction_size):
    
    df = df.copy()
    
    df['e'] = df['y'] - df['yhat']
    df['p'] = 100 * df['e'] / df['y']
    
    predicted_part = df[-prediction_size:]
    
    error_mean = lambda error_name: np.mean(np.abs(predicted_part[error_name]))
    
    return {'MAPE': error_mean('p'), 'MAE': error_mean('e')}
In [36]:
for err_name, err_value in calculate_forecast_errors(cmp_df, prediction_size).items():
    print(err_name, err_value)
MAPE 13.858454067188074
MAE 109.32387955182074
In [37]:
plt.figure(figsize=(17, 8))
plt.plot(cmp_df['yhat'])
plt.plot(cmp_df['yhat_lower'])
plt.plot(cmp_df['yhat_upper'])
plt.plot(cmp_df['y'])
plt.xlabel('Time')
plt.ylabel('Average Weekly NOx Concentration')
plt.grid(False)
plt.show()