In [61]:
import pandas as pd
import altair as alt

Setting parameters

In [2]:
URL = 'https://raw.githubusercontent.com/Datenspieler/notebooks_for_blog/master/2020-treibstoff/treibstoff.csv'

Getting data from Github

In [8]:
df = pd.read_csv(URL)
df.date = pd.to_datetime(df.date)
In [9]:
df.tail()
Out[9]:
date id price type
7288 2020-07-21 00:20:02 218712 0.999 SUP
7289 2020-07-21 00:20:02 224519 1.003 SUP
7290 2020-07-21 00:20:02 1354761 1.004 SUP
7291 2020-07-21 00:20:02 665614 1.013 SUP
7292 2020-07-21 00:20:02 1354799 1.013 SUP
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7293 entries, 0 to 7292
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    7293 non-null   datetime64[ns]
 1   id      7293 non-null   int64         
 2   price   7293 non-null   float64       
 3   type    7293 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 228.0+ KB
In [62]:
print(df.date.min())
print(df.date.max())
2020-06-23 22:20:02
2020-07-21 00:20:02

Preparing data

Add price (min and mean) over time

In [11]:
df['min'] = df.groupby(['type', 'date'])[['price']].transform('min')
df['mean'] = df.groupby(['type', 'date'])[['price']].transform('mean')

Dropping duplicate data

Since for this analysis only interested in minimum/mean price, not what gas station is offering that price, can drop a lot of data.

In [18]:
df = df[['date', 'type', 'min', 'mean']].drop_duplicates()
In [19]:
df.head(3)
Out[19]:
date type min mean
0 2020-06-23 22:20:02 DIE 0.949 0.9546
5 2020-06-23 22:20:02 SUP 0.999 1.0046
10 2020-06-23 23:20:01 DIE 0.949 0.9546

First look at the data

In [31]:
chart_zero = alt.Chart(df).mark_line().encode(
    x='date',
    y=alt.Y(alt.repeat("column"), type='quantitative'),
    color='type',
    strokeDash='type',
).repeat(
    column=['min', 'mean']
)

chart_zero
Out[31]:

Some movement, escepcially if we look at the mean price. We can make it look more spectacular, if we don't let the price start at 0 - great trick and bad visualization practive at the same time 😉

In [36]:
chart_not_zero = alt.Chart(df).mark_line().encode(
    x='date',
    y=alt.Y(alt.repeat("column"), type='quantitative', scale=alt.Scale(zero=False)),
    color='type',
    strokeDash='type',
).repeat(
    column=['min', 'mean']
)

chart_not_zero
Out[36]:

Analysis on day of week and hour of day

Add rolling average

To 'correct' for long term trend, let's take the 1 week (24 x 7 measurements) average.

In [39]:
df1 = df.set_index(['type', 'date']).unstack('type')
df1.columns = ['_'.join(col[::-1]).strip() for col in df1.columns.values]
df1.head()
Out[39]:
DIE_min SUP_min DIE_mean SUP_mean
date
2020-06-23 22:20:02 0.949 0.999 0.954600 1.004600
2020-06-23 23:20:01 0.949 0.999 0.954600 1.004600
2020-06-24 00:20:02 0.949 0.999 0.958857 1.008857
2020-06-24 01:20:02 0.949 0.999 0.958857 1.008857
2020-06-24 02:20:01 0.949 0.999 0.958857 1.008857

Look at difference from average, switch to cent values

In [51]:
df_delta = df1 - df1.rolling(7*24, center = True).mean()
df_delta = df_delta * 100 #Let's switch to Cent values
df_delta.head() # of course the first entries are NaN
Out[51]:
DIE_min SUP_min DIE_mean SUP_mean
date
2020-06-23 22:20:02 NaN NaN NaN NaN
2020-06-23 23:20:01 NaN NaN NaN NaN
2020-06-24 00:20:02 NaN NaN NaN NaN
2020-06-24 01:20:02 NaN NaN NaN NaN
2020-06-24 02:20:01 NaN NaN NaN NaN
In [52]:
df_delta.tail()
Out[52]:
DIE_min SUP_min DIE_mean SUP_mean
date
2020-07-20 20:20:01 NaN NaN NaN NaN
2020-07-20 21:20:02 NaN NaN NaN NaN
2020-07-20 22:20:01 NaN NaN NaN NaN
2020-07-20 23:20:02 NaN NaN NaN NaN
2020-07-21 00:20:02 NaN NaN NaN NaN
In [53]:
df_delta.describe()
Out[53]:
DIE_min SUP_min DIE_mean SUP_mean
count 484.000000 484.000000 484.000000 484.000000
mean -0.006018 -0.055542 -0.034468 -0.071316
std 0.523949 0.631228 0.824300 1.033422
min -1.725000 -1.467262 -2.210951 -2.151137
25% -0.088690 -0.322917 -0.418081 -0.688466
50% 0.080952 0.155357 -0.061518 -0.036823
75% 0.200000 0.356250 0.280230 0.390915
max 0.989286 0.911310 2.885819 3.697436

Prepare data for plots

In [54]:
df_delta = df_delta.reset_index()
In [55]:
df_chart = df_delta.groupby([df_delta.date.dt.day_name(), df_delta.date.dt.hour]).mean()#.stack()
df_chart.index.names = ['Day of week', 'Hour']
df_chart
Out[55]:
DIE_min SUP_min DIE_mean SUP_mean
Day of week Hour
Friday 0 0.006151 0.375000 0.063581 0.399897
1 -0.001190 0.373016 0.059726 0.398581
2 -0.008532 0.371032 0.053542 0.397589
3 -0.015873 0.369048 0.046916 0.396549
4 -0.023214 0.367063 0.039693 0.392836
... ... ... ... ... ...
Wednesday 19 0.201984 0.295040 -0.018503 -0.056196
20 0.195833 0.292659 0.023958 -0.058444
21 0.189683 0.290278 0.030862 0.063117
22 0.183532 0.287698 -0.037788 0.366133
23 0.177381 0.285119 -0.044204 0.363513

168 rows × 4 columns

In [56]:
df_chart = df_chart.reset_index()
df_chart.describe()
Out[56]:
Hour DIE_min SUP_min DIE_mean SUP_mean
count 168.000000 168.000000 168.000000 168.000000 168.000000
mean 11.500000 0.014533 -0.035066 -0.018058 -0.054979
std 6.942881 0.462745 0.542503 0.747398 0.945292
min 0.000000 -1.290079 -1.226190 -1.605849 -1.867506
25% 5.750000 -0.003026 -0.367857 -0.307451 -0.750507
50% 11.500000 0.129663 0.173810 -0.071218 0.063262
75% 17.250000 0.289087 0.359623 0.311333 0.393182
max 23.000000 0.866667 0.507143 2.326624 2.936946
In [60]:
df_chart.select_dtypes('number').max() - df_chart.select_dtypes('number').min()
Out[60]:
Hour        23.000000
DIE_min      2.156746
SUP_min      1.733333
DIE_mean     3.932473
SUP_mean     4.804451
dtype: float64
In [57]:
for (col, title_part) in [('DIE_min','diesel, cheapest gas station') , 
                          ('DIE_mean', 'diesel, mean of 5 cheapest gas stations'),
                          ('SUP_min','premium gas, cheapest gas station') , 
                          ('SUP_mean', 'premium gas, mean of 5 cheapest gas stations')]:
    chart = alt.Chart(df_chart).mark_rect().encode(
        x='Hour:O',
        y=alt.Y('Day of week:O', sort=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']),
        color=alt.Color(col, type='quantitative', title='Cent', 
                    scale=alt.Scale(scheme='redyellowgreen', domainMid = 0, domain = [-2, 2], reverse=True))
    ).properties(
        title='Difference in gasoline price - ' + title_part
    )
    chart.save(col + '.svg')
    display(chart)
In [ ]:
 
In [ ]:
 
In [ ]: