In [1]:
import pandas as pd
import numpy as np
import math
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import dateutil
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from scipy import stats
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning:

numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88

Carga y limpieza de datos

In [2]:
df_start = pd.read_excel('../Data/Graficas DustTrack Honeywell 5sept comp.xlsx')
df_start.head()
Out[2]:
Valor DustTrack Valor Honeywell Hora Unnamed: 3
0 26 10 07:06:26 NaN
1 26 10 07:06:27 NaN
2 26 9 07:06:28 NaN
3 25 9 07:06:29 NaN
4 22 9 07:06:30 NaN
In [3]:
# Eliminar nulos y headers repetidos

df_start = df_start[~df_start['Valor DustTrack'].isna()]
df_start = df_start[df_start.columns.values[:-1]]
sum(df_start['Valor DustTrack'] == 'Valor DustTrack')
df_start = df_start[np.logical_not(df_start['Valor DustTrack'] == 'Valor DustTrack')]

df_start.index = np.arange(len(df_start))
In [4]:
# Casting a variables numericas

df_start[['Valor DustTrack', 'Valor Honeywell']] = df_start[['Valor DustTrack', 'Valor Honeywell']].apply(pd.to_numeric)
In [5]:
def plot_time_series(df, variables, filename):    
    data = list()
    for variable in variables:
        data.append(go.Scatter(x = df.index,
                                 y = df[variable],
                                 name=variable))
        
    layout = go.Layout(title='DustTrack vs Honeywell',
                       yaxis=dict(title='Pm 2.5 value'),
                       xaxis=dict(title='N° Observation'))
    fig = go.Figure(data=data, layout=layout)
    plot_url = py.iplot(fig, filename=filename)
    return plot_url
    
# Login plotly
py.sign_in('FranciscoJavierOspinaSalazar', 'WLz5dfnwRjfo2cSwlRiL')

Visualización inicial de los datos y exploraciones univariadas

In [6]:
plot_time_series(df=df_start, variables=['Valor DustTrack', 'Valor Honeywell'], filename='Comparison_sensors')
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~FranciscoJavierOspinaSalazar/0 or inside your plot.ly account where it is named 'Comparison_sensors'
Out[6]:

Boxplot de ambas mediciones

In [7]:
def build_boxplot(df, variables, filename):
    if isinstance(variables, str):
        variables = [variables]
    data = list()
    for variable in variables:
        data.append(go.Box(y = df[variable],
                           name=variable))
        
    layout = go.Layout(title='Boxplot',
                       yaxis=dict(title='Pm 2.5 value'))
    fig = go.Figure(data=data, layout=layout)
    plot_url = py.iplot(fig, filename=filename)
    return plot_url
    
In [8]:
build_boxplot(df=df_start, variables='Valor DustTrack', filename='Boxplot_DustTrack')
Out[8]:
In [9]:
build_boxplot(df=df_start, variables='Valor Honeywell', filename='Boxplot_Honeywell')
Out[9]:
In [10]:
build_boxplot(df=df_start, variables=['Valor DustTrack', 'Valor Honeywell'], filename='Boxplot_Comparison1')
Out[10]:

Remoción de outliers

In [11]:
df = df_start.copy()
df = df[np.logical_not(np.logical_or(df['Valor Honeywell'] > 90, df['Valor DustTrack'] > 492))]

Suavizamiento de serie con medias móviles

La descripción de las médias móviles y su uso para suavizar picos en series de tiempo se encuentra en http://www.expansion.com/diccionario-economico/media-movil.html

In [12]:
n_movil_mean = 20
movil_mean = df[['Valor DustTrack', 'Valor Honeywell']].rolling(n_movil_mean).mean()
movil_mean.columns = ['Mm_DustTrack', 'Mm_Honeywell']
movil_mean = movil_mean.loc[n_movil_mean - 1:]
In [13]:
## Reescalamiento de los datos con la función Logaritmo
In [14]:
movil_mean = movil_mean.assign(Log10_DustTrack=np.log10(movil_mean['Mm_DustTrack']))
movil_mean = movil_mean.assign(Log10_Honeywell=np.log10(movil_mean['Mm_Honeywell']))
movil_mean.index = np.arange(len(movil_mean))
movil_mean.head()
Out[14]:
Mm_DustTrack Mm_Honeywell Log10_DustTrack Log10_Honeywell
0 22.60 8.40 1.354108 0.924279
1 22.15 8.25 1.345374 0.916454
2 21.80 8.10 1.338456 0.908485
3 21.55 8.00 1.333447 0.903090
4 21.45 7.90 1.331427 0.897627
In [15]:
plot_time_series(df=movil_mean, variables=['Mm_DustTrack', 'Mm_Honeywell'], filename='serie_movil')
Out[15]:

Con el suavizamiento se suprimieron los picos que superaban 2000 en el eje y

In [16]:
plot_time_series(df=movil_mean, variables=['Log10_DustTrack', 'Log10_Honeywell'], filename='log10_serie_movil')
Out[16]:
In [17]:
build_boxplot(df=movil_mean, variables=['Log10_DustTrack', 'Log10_Honeywell'], filename='Bloxplot_log10')
Out[17]:

Partición de la base de datos en traint test

In [18]:
size_train = int(len(movil_mean) * .7)
index_train = np.random.choice(np.arange(len(movil_mean)), size=size_train, replace=False)
index_test = np.setdiff1d(np.arange(len(movil_mean)), index_train)
df_train = movil_mean.iloc[index_train]
df_test = movil_mean.iloc[index_test]
df_train = df_train.sort_index()
df_test = df_test.sort_index()

Escalamiento de los datos

In [19]:
scaler_dt = MinMaxScaler()
scaler_dt.fit(df_train['Mm_DustTrack'].values.reshape(-1, 1))
scaled_DustTrack_train = scaler_dt.transform(df_train['Mm_DustTrack'].values.reshape(-1,1))
scaled_DustTrack_test = scaler_dt.transform(df_test['Mm_DustTrack'].values.reshape(-1,1))

scaler_hw = MinMaxScaler()
scaler_hw.fit(df_train['Mm_Honeywell'].values.reshape(-1, 1))
scaled_Honeywell_train = scaler_hw.transform(df_train['Mm_Honeywell'].values.reshape(-1,1))
scaled_Honeywell_test = scaler_hw.transform(df_test['Mm_Honeywell'].values.reshape(-1,1))

df_train = df_train.assign(scaled_DustTrack=scaled_DustTrack_train)
df_train = df_train.assign(scaled_Honeywell=scaled_Honeywell_train)
df_test = df_test.assign(scaled_DustTrack=scaled_DustTrack_test)
df_test = df_test.assign(scaled_Honeywell=scaled_Honeywell_test)
movil_mean = movil_mean.assign()

df_train.head()
Out[19]:
Mm_DustTrack Mm_Honeywell Log10_DustTrack Log10_Honeywell scaled_DustTrack scaled_Honeywell
0 22.60 8.40 1.354108 0.924279 0.040109 0.078555
2 21.80 8.10 1.338456 0.908485 0.038361 0.075115
5 21.40 7.75 1.330414 0.889302 0.037486 0.071101
7 22.65 7.45 1.355068 0.872156 0.040219 0.067661
9 23.10 7.20 1.363612 0.857332 0.041202 0.064794
In [20]:
build_boxplot(df=df_train, variables=['scaled_DustTrack', 'scaled_Honeywell'], filename='Bloxplot_log10')
Out[20]:

Correlación lineal de Pearson

https://es.wikipedia.org/wiki/Coeficiente_de_correlaci%C3%B3n_de_Pearson

Debido a que solo son dos variables si la correlación de pearson es alta se pueden hacer modelos que intenten generar la variable Valor DustTrack a partir de Valor Honeywell.

In [21]:
# Coeficinte de correlación sobre las variables transformadas por logaritmos de las
# series suavizadas por las media móviles
stats.pearsonr(df_train['Log10_DustTrack'], df_train['Log10_Honeywell'])[0]
Out[21]:
0.977387144188508
In [22]:
stats.pearsonr(df_train['scaled_DustTrack'], df_train['scaled_Honeywell'])[0]
Out[22]:
0.973157336766973

Tanto en la gráfica de la serie de tiempo como en la correlación, estas son las mejores variables hasta el momento para el modelo.

Construcción de modelos

In [23]:
model_min_max_scaler = linear_model.LinearRegression(fit_intercept=True)
model_log_scaler = linear_model.LinearRegression(fit_intercept=True)
model_min_max_scaler.fit(X=df_train['scaled_Honeywell'].values.reshape(-1,1), y=df_train['scaled_DustTrack'].values.reshape(-1,1), )
model_log_scaler.fit(X=df_train['Log10_Honeywell'].values.reshape(-1,1), y=df_train['Log10_DustTrack'].values.reshape(-1,1))
Out[23]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Párametros de los modelos

In [24]:
print('Parameters model with log scaler. Coef = {}, Intercept = {}'.format(model_log_scaler.coef_[0][0], model_log_scaler.intercept_[0]))
print('Parameters model with min_max scaler. Coef = {}, Intercept = {}'.format(model_min_max_scaler.coef_[0][0], model_min_max_scaler.intercept_[0]))
Parameters model with log scaler. Coef = 1.1690681431361163, Intercept = 0.4163977469816482
Parameters model with min_max scaler. Coef = 1.028125900448871, Intercept = -0.025268098493561097

Append de predicciones a las series suavizadas

In [25]:
pred_min_max_scaler_train = scaler_dt.inverse_transform(model_min_max_scaler.predict(df_train['scaled_Honeywell'].values.reshape(-1,1)).reshape(-1,1))
pred_log_scaler_train = np.power(10, model_log_scaler.predict(df_train['Log10_Honeywell'].values.reshape(-1,1)).reshape(-1,1))

df_train = df_train.assign(pred_min_max_scaler=pred_min_max_scaler_train)
df_train = df_train.assign(pred_log_scaler=pred_log_scaler_train)

pred_min_max_scaler_test= scaler_dt.inverse_transform(model_min_max_scaler.predict(df_test['scaled_Honeywell'].values.reshape(-1,1)).reshape(-1,1))
pred_log_scaler_test = np.power(10, model_log_scaler.predict(df_test['Log10_Honeywell'].values.reshape(-1,1)).reshape(-1,1))

df_test = df_test.assign(pred_min_max_scaler=pred_min_max_scaler_test)
df_test = df_test.assign(pred_log_scaler=pred_log_scaler_test)

df_test.head()
Out[25]:
Mm_DustTrack Mm_Honeywell Log10_DustTrack Log10_Honeywell scaled_DustTrack scaled_Honeywell pred_min_max_scaler pred_log_scaler
1 22.15 8.25 1.345374 0.916454 0.039126 0.076835 28.830475 30.746442
3 21.55 8.00 1.333447 0.903090 0.037814 0.073968 27.481944 29.660023
4 21.45 7.90 1.331427 0.897627 0.037596 0.072821 26.942531 29.227050
6 21.80 7.60 1.338456 0.880814 0.038361 0.069381 25.324294 27.933725
8 22.95 7.30 1.360783 0.863323 0.040874 0.065940 23.706057 26.649005

Append de predicciones a las series sin suaizar

In [26]:
df_start = df_start.assign(pred_log_scaler=np.power(10, model_log_scaler.predict(np.log10(df_start['Valor Honeywell']).values.reshape(-1,1)).reshape(-1,1)))

scaled_Honeywell = scaler_hw.transform(df_start['Valor Honeywell'].values.reshape(-1,1))
df_start = df_start.assign(scaled_Honeywell=scaled_Honeywell)
pred_min_max_scaler = scaler_dt.inverse_transform(model_min_max_scaler.predict(df_start['scaled_Honeywell'].values.reshape(-1,1)).reshape(-1,1))
df_start = df_start.assign(pred_min_max_scaler=pred_min_max_scaler)
df_start.head()
Out[26]:
Valor DustTrack Valor Honeywell Hora pred_log_scaler scaled_Honeywell pred_min_max_scaler
0 26 10 07:06:26 38.500457 0.096904 38.270191
1 26 10 07:06:27 38.500457 0.096904 38.270191
2 26 9 07:06:28 34.038645 0.085436 32.876068
3 25 9 07:06:29 34.038645 0.085436 32.876068
4 22 9 07:06:30 34.038645 0.085436 32.876068

Plots del valor real vs la predicción del modelo (sobre serie suavizada / train)

In [28]:
plot_time_series(df=df_train, 
                 variables=['pred_min_max_scaler', 'pred_log_scaler', 'Mm_DustTrack'], 
                 filename='Predictions_01')
Out[28]:

Plots del valor real vs la predicción del modelo (sobre serie suavizada / test)

In [29]:
plot_time_series(df=df_test, 
                 variables=['pred_min_max_scaler', 'pred_log_scaler', 'Mm_DustTrack'], 
                 filename='Predictions_02')
Out[29]:
In [30]:
plot_time_series(df=df_start, 
                 variables=['pred_min_max_scaler', 'pred_log_scaler', 'Valor DustTrack'], 
                 filename='Predictions_03')
The draw time for this plot will be slow for clients without much RAM.
/home/pacho/anaconda3/envs/jupyter3/lib/python3.5/site-packages/plotly/api/v1/clientresp.py:40: UserWarning:

Estimated Draw Time Slow

Out[30]:

Errores de la predicción vs el valor real

In [31]:
dc_errors = dict()
for key, value in {'Train Data': df_train, 'Test Data': df_test, 'Original Data': df_start}.items():
    true_val = 'Mm_DustTrack'
    if key == 'Original Data':
        true_val = 'Valor DustTrack'
    dc_errors[key] = [mean_squared_error(value[true_val], value['pred_log_scaler']),
                      mean_squared_error(value[true_val], value['pred_min_max_scaler']),
                      mean_absolute_error(value[true_val], value['pred_log_scaler']),
                      mean_absolute_error(value[true_val], value['pred_min_max_scaler'])]
df_errors = pd.DataFrame(dc_errors)
df_errors.index = ['Log10 transform MSE', 'Min_max transform MSE', 'Log10 transform MAE', 'Min_max transform MAE']
df_errors[['Train Data', 'Test Data', 'Original Data']]
Out[31]:
Train Data Test Data Original Data
Log10 transform MSE 621.486917 633.921876 3187.670520
Min_max transform MSE 594.730273 613.857632 3137.811488
Log10 transform MAE 17.327988 17.479541 26.216892
Min_max transform MAE 17.616325 17.860014 26.360610

Nota: Los valores de MAE y MSE corresponden al error absoluto medio y error cuadrático medio

In [32]:
def plot_density(df, y_tue, y_pred, group_labels, title, filename):
    hist_data = list()
    for y in y_pred:
        hist_data.append(df[y_tue] - df[y])

    group_labels = group_labels

    # Create distplot with custom bin_size
    fig = ff.create_distplot(hist_data, group_labels, show_hist=False)
    fig['layout'].update(title=title)
    plot_url = py.iplot(fig, filename=filename)
    return plot_url
In [33]:
plot_density(df=df_train, 
             y_tue='Mm_DustTrack', 
             y_pred=['pred_log_scaler', 'pred_min_max_scaler'], 
             group_labels=['Log scaler', 'Min_max scaler'], 
             title='Densidad de los residuales train', 
             filename='Residuals_density_01')
Out[33]:
In [34]:
plot_density(df=df_test, 
             y_tue='Mm_DustTrack', 
             y_pred=['pred_log_scaler', 'pred_min_max_scaler'], 
             group_labels=['Log scaler', 'Min_max scaler'], 
             title='Densidad de los residuales test', 
             filename='Residuals_density_02')
Out[34]:
In [35]:
plot_density(df=df_start, 
             y_tue='Valor DustTrack', 
             y_pred=['pred_log_scaler', 'pred_min_max_scaler'], 
             group_labels=['Log scaler', 'Min_max scaler'], 
             title='Densidad de los residuales test', 
             filename='Residuals_density_02')
Out[35]:
In [36]:
df_train = df_train.assign(res_log_scaler=np.abs(df_train['Mm_DustTrack'] - df_train['pred_log_scaler']))
df_train = df_train.assign(res_min_max_scaler=np.abs(df_train['Mm_DustTrack'] - df_train['pred_min_max_scaler']))
df_test = df_test.assign(res_log_scaler=np.abs(df_test['Mm_DustTrack'] - df_test['pred_log_scaler']))
df_test = df_test.assign(res_min_max_scaler=np.abs(df_test['Mm_DustTrack'] - df_test['pred_min_max_scaler']))
df_start = df_start.assign(res_log_scaler=np.abs(df_start['Valor DustTrack'] - df_start['pred_log_scaler']))
df_start = df_start.assign(res_min_max_scaler=np.abs(df_start['Valor DustTrack'] - df_start['pred_min_max_scaler']))
In [37]:
step = 0.05
quantile_train = df_train[['res_log_scaler', 'res_min_max_scaler']].quantile(np.arange(0, 1 + step, step))
quantile_train.columns = [['res_log_scaler_train', 'res_min_max_scaler_train']]
quantile_test = df_test[['res_log_scaler', 'res_min_max_scaler']].quantile(np.arange(0, 1 + step, step))
quantile_test.columns = [['res_log_scaler_test', 'res_min_max_scaler_test']]
quantile_start = df_start[['res_log_scaler', 'res_min_max_scaler']].quantile(np.arange(0, 1 + step, step))
quantile_start.columns = [['res_log_scaler_original_data', 'res_min_max_scaler_original_data']]
df_quantile = pd.concat([quantile_train, quantile_test, quantile_start], axis=1)
df_quantile
Out[37]:
res_log_scaler_train res_min_max_scaler_train res_log_scaler_test res_min_max_scaler_test res_log_scaler_original_data res_min_max_scaler_original_data
0.00 0.006140 0.007205 0.003396 0.014924 0.000484 0.000322
0.05 0.883976 1.332992 1.015330 1.379544 1.038645 1.300217
0.10 1.844173 2.560733 1.933170 2.715016 2.190064 2.694341
0.15 2.900505 3.828050 2.932513 3.907760 3.353153 4.089009
0.20 3.965462 5.029690 4.046841 5.182302 4.593960 5.394446
0.25 5.116023 6.227400 5.177273 6.320191 5.809936 6.787925
0.30 6.317736 7.329271 6.461555 7.546718 7.231849 8.335685
0.35 7.585589 8.544034 7.766379 8.734637 8.677180 10.000322
0.40 8.815003 9.795220 8.971979 10.011467 10.320686 11.458650
0.45 10.273383 11.274136 10.295546 11.329072 11.923693 13.153313
0.50 11.754673 12.788696 11.999226 12.767898 13.944236 15.059084
0.55 13.359369 14.370286 13.682751 14.465869 16.320686 17.029058
0.60 15.266871 16.118046 15.711184 16.228732 18.961427 19.608375
0.65 17.286726 18.011233 17.548850 18.228058 21.858249 22.723722
0.70 20.199004 20.547473 19.961648 20.857797 25.695427 26.240811
0.75 23.187883 23.350032 23.265220 23.871672 30.090239 30.941561
0.80 26.926752 27.228856 26.989612 27.159494 37.041599 37.212647
0.85 32.574526 32.517703 32.055632 32.618790 46.956124 45.278274
0.90 40.724674 39.561795 40.177895 40.241041 62.691325 59.762197
0.95 53.243903 50.644692 54.098914 50.068867 89.133679 85.841765
1.00 164.389119 166.757485 165.180306 167.689248 2834.534580 2869.411653

Ecuación final de modelamiento (transformaión Log10)

La ecuación final que mejor se comporta para el modelamiento de los valores de Valor DustTrack a partir de los datos de Valor Honeywell es:

$$y = 10^{1.1675 * log_{10}(x) + 0.4173}$$

Donde X es el valor de la variable Valor Honeywell y y el valor de la variable Valor DustTrack m

Conclusiones finales

  • Es posible hacer una reconstrucción no perfecta pero si bastante buena de los datos del sensor DustTrack con base en los datos del sensor Honeywell.
In [ ]: