In [1]:

from statsmodels.tsa.statespace.varmax import VARMAX

from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

We have app data that for each user pairs reported app metrics and app name:

In [2]:

appDf = pd.read_csv("app.data", names=["user", "date", "app", "metric"])
appDf['date'] = pd.to_datetime(appDf['date'])
appDf.info()
print(appDf.user.unique(), appDf.app.unique())
appDf.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 4 columns):
user      497 non-null object
date      497 non-null datetime64[ns]
app       497 non-null object
metric    497 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 15.6+ KB
['user_2' 'user_1' 'user_3'] [' app_2' ' app_1' ' app_3']

Out[2]:

	user	date	app	metric
0	user_2	2017-08-28 02:41:48	app_2	0.00
1	user_1	2017-08-28 11:01:01	app_1	0.01
2	user_3	2017-08-28 16:41:55	app_1	0.10
3	user_3	2017-08-29 02:43:39	app_3	0.09
4	user_1	2017-08-29 07:00:25	app_1	0.02

We have location data that uses keywords for location and indicates a change of location:

In [3]:

locationDf = pd.read_csv("location.data", names=["user", "date", "location"])
locationDf['date'] = pd.to_datetime(locationDf['date'])

cross_l = pd.crosstab([locationDf.date, locationDf.user], locationDf.location)

cross_l.head(3)

Out[3]:

	location	bar	girlfriends	grocers	home	lunch	park	relatives	work
date	user
2017-08-28 00:00:01	user_2	0	0	0	1	0	0	0	0
2017-08-28 00:00:01	user_3	0	0	0	1	0	0	0	0
2017-08-28 19:25:16	user_3	0	0	0	0	0	0	1	0

Location data includes seasonality at actual seasonal levels (summer, winter), as well as weekday/weekend behavior. Here this is demonstrated with cumsum:

In [4]:

cross = cross_l.copy()

l2 = cross.groupby(['user']).cumsum().copy()
l2.reset_index(inplace=True)

plotaxis = plt.figure(figsize=(50,20)).gca()
for key, grp in l2.groupby(['user']):
    my_ts = [ts.to_julian_date() - 1721424.5 for ts in grp['date']]
    plt.plot(my_ts, grp.drop('user', axis=1).drop('date', axis=1), label=key)

plotaxis.xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)

plt.legend(bbox_to_anchor=(.02, 0.52, 1., .102), loc=3,
           ncol=2, borderaxespad=0., prop={'size': 26})    

#l3 = l2[l2['user'] == 'user_1'].copy()
#l3['month']=l3['date'].dt.month
#grouped = l3.groupby(l3['month'])
#
#for m in grouped.groups.keys():
#    print(l3[l3['month'] == m].tail(1))
#l2.groupby(['user']).sum()

'User/Locations'

Out[4]:

'User/Locations'

The some metric data is set to have positive or negative correlation in terms of growth, to location data. Excluding seasonality, the metric data should trend upwards:

In [5]:

#appDf.groupby(['user', 'app']).plot(x="date", y="metric", subplots=True)

plotaxis = plt.figure(figsize=(50,20)).gca()
for key, grp in appDf.groupby(['user', 'app']):
    my_ts = [ts.to_julian_date() - 1721424.5 for ts in grp['date']]
    plt.plot(my_ts, grp['metric'], label='%s@%s' % ("metric", key))

plotaxis.xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
'Users/Apps'

Out[5]:

'Users/Apps'

In the next plot, we can see in fine detail user_3's metric trend and location data. Looking at the very beginning at increased zoom, you might notice a visual correlation between the first few gaps and the faster-rising app data. Frequency of use should also positively correlate.

In [6]:

cross.reset_index(inplace=True)

u1_l = cross[cross['user'] == 'user_1']

u1_a = appDf[appDf['user'] == 'user_1']

u1_a1 = u1_a[u1_a['app'] == ' app_1']
u1_a2 = u1_a[u1_a['app'] == ' app_2']

plotaxis = plt.figure(figsize=(100,10)).gca()

for key, grp in u1_a.groupby(['app']):
    my_ts = [ts.to_julian_date() for ts in grp['date']]
    plt.plot(grp['date'], grp['metric'], '.-', label='%s@%s' % ("metric", key))    

for key, grp in u1_l.groupby(['user']):
    my_ts = [ts.to_julian_date() for ts in grp['date']]
    plt.plot(grp['date'], grp.drop('user', axis=1).drop('date', axis=1), label=key)
    
plotaxis.xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
True

Out[6]:

True

We want to produce classical machine learning/statistical modeling as a baseline to justify RNN approaches. We will follow this strategy in preparation of an ARIMA model:

resample to 15m intervals (data set would be insufficient to form the ARIMA model as the majority are errant occluding/masking/hiding the underlying predictive signal/model/equation/weighted average necessary to provide the baseline, so we will interpolate at the end)
fill rows in both set by date, so the data sets can be combined.
convert metric data to metric values per app: first creating a column for each app.
get column categories for location
divide data into per user charts.
combine app/metric and location data.
interpolate metric data per user to get curves we can predict from.

In [7]:

locDf = locationDf.copy()
locDf.set_index('date', inplace=True)

locDfs = {}
for user, user_loc_dc in locDf.groupby('user'):
    locDfs[user] = user_loc_dc.resample('15T').agg('max').bfill()
    
aDf = appDf.copy()
aDf.set_index('date', inplace=True)

userLocAppDfs = {}
for user, a2_df in aDf.groupby('user'):
    userDf = a2_df.resample('15T').agg('max')
                
    userDf.reset_index(inplace=True)
    userDf = pd.crosstab(index=userDf['date'], columns=userDf['app'], values=userDf['metric'], aggfunc=np.mean).fillna(np.nan, downcast='infer')

    userDf['user'] = user
        
    userDf.reset_index(inplace=True)
    userDf.set_index('date', inplace=True)

    userLocAppDfs[user] = userDf.resample('15T').agg('max')
            
    loDf = locDfs[user]
    loDf.reset_index(inplace=True)
    loDf = pd.crosstab([loDf.date, loDf.user], loDf.location)
    loDf.reset_index(inplace=True)

    loDf.set_index('date', inplace=True)
    loDf.drop('user', axis=1, inplace=True)
    
    userLocAppDfs[user] = userLocAppDfs[user].join(loDf, how='outer')
    userLocAppDfs[user] = userLocAppDfs[user].resample('15T').agg('max')
    userLocAppDfs[user]['user'].fillna(user, inplace=True)
    
    for loc in locationDf[locationDf['user'] == user].location.unique():
        userLocAppDfs[user][loc] = userLocAppDfs[user][loc].replace(np.nan, 0)
            
    for app in a2_df['app'].unique():
        #print(userLocAppDfs[user][app].head(10))
        #userLocAppDfs[user][app].fillna(value=0, inplace=True)
        #userLocAppDfs[user][app].replace(0.0, np.nan, inplace=True)

        userLocAppDfs[user][app].interpolate(method='linear', limit_area='inside', inplace=True)       
        userLocAppDfs[user][app].fillna(value=0, inplace=True)
        
#userLocAppDfs['user_1'].tail(5)

At this point, we have our data set per user. Here's what one user's data looks like:

In [8]:

u1 = userLocAppDfs['user_1'].copy()
u1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15011 entries, 2017-08-28 11:00:00 to 2018-01-31 19:30:00
Freq: 15T
Data columns (total 10 columns):
 app_1        15011 non-null float64
 app_2        15011 non-null float64
user          15011 non-null object
 bar          15011 non-null float64
 grocers      15011 non-null float64
 home         15011 non-null float64
 lunch        15011 non-null float64
 park         15011 non-null float64
 relatives    15011 non-null float64
 work         15011 non-null float64
dtypes: float64(9), object(1)
memory usage: 1.3+ MB

In [15]:

u1 = userLocAppDfs['user_1'].copy()
# https://stackoverflow.com/questions/11927715/how-to-give-a-pandas-matplotlib-bar-graph-custom-colors
locations = [(x/8.75, x/40.0, 0.85) for x in range(7)] # color grad

u1[[' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives']].plot(color=locations, figsize=(15,10))
#height = 1
#
## https://stackoverflow.com/questions/52467781/how-can-i-fill-the-area-in-this-plot/52467937#52467937
#fig, ax = plt.subplots(1,1)
#for i, location in enumerate([' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives']):
#    ax.fill_between(range(len(u1[location])), height * u1[location], step='post', color=locations[i])
                    
u1[' app_2'].plot(color='orange')
u1[' app_1'].plot(color='r')

#u1[135:140]

Out[15]:

<matplotlib.axes._subplots.AxesSubplot at 0x1a26844128>

In [28]:

print(u1.info())
u1.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15011 entries, 2017-08-28 11:00:00 to 2018-01-31 19:30:00
Freq: 15T
Data columns (total 10 columns):
 app_1        15011 non-null float64
 app_2        15011 non-null float64
user          15011 non-null object
 bar          15011 non-null float64
 grocers      15011 non-null float64
 home         15011 non-null float64
 lunch        15011 non-null float64
 park         15011 non-null float64
 relatives    15011 non-null float64
 work         15011 non-null float64
dtypes: float64(9), object(1)
memory usage: 1.3+ MB
None

Out[28]:

	app_1	app_2	user	bar	grocers	home	lunch	park	relatives	work
date
2017-08-28 11:00:00	0.010000	0.0	user_1	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2017-08-28 11:15:00	0.010125	0.0	user_1	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2017-08-28 11:30:00	0.010250	0.0	user_1	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2017-08-28 11:45:00	0.010375	0.0	user_1	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2017-08-28 12:00:00	0.010500	0.0	user_1	0.0	0.0	0.0	0.0	0.0	0.0	0.0

okay, let's fit a model to data through oct 3 to see if we can predict the down-trend in app_1.

In [25]:

columns = [ ' app_1', ' app_2', ' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives' ]
series = u1[columns]

# from: https://machinelearningmastery.com/make-predictions-time-series-forecasting-python/
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        value = dataset[i] - dataset[i - 1]
        diff.append(value)
    return np.array(diff)

# Make a prediction give regression coefficients and lag obs
def predict(coef, history):
    yhat = coef[0]
    for i in range(1, len(coef)):
        yhat += coef[i] * history[-i]
    return yhat

X = pd.DataFrame()
for column in columns:
    X[column] = difference(series[column].values)

size = (4*24)*54 # hoping
train, test = X[0:size], X[size:size+(14*4*24)]

train = train.loc[:, (train != train.iloc[0]).any()] # https://stackoverflow.com/questions/20209600/panda-dataframe-remove-constant-column
test = test.loc[:, (test != test.iloc[0]).any()] # https://stackoverflow.com/questions/20209600/panda-dataframe-remove-constant-column

#print(train.var(), X.info())

# train autoregression
model = VARMAX(train)
model_fit = model.fit(method='powell', disp=False)
#print(model_fit.mle_retvals)

##window = model_fit.k_ar
coef = model_fit.params

# walk forward over time steps in test
history = [train.iloc[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
    yhat = predict(coef, history)
    obs = test.iloc[t]
    predictions.append(yhat)
    history.append(obs)

In [26]:

print(mean_squared_error(test, predictions))


# plot
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()

0.5594208989876831

In [ ]: