TileDB Test¶

Imports¶

In [1]:

import pandas as pd
import numpy as np
import tiledb
import requests
import io

Reading in Data¶

We'll start by reading in data on the UL electricity generation between 2015 and 2019

In [2]:

#df_FUELHH_raw = pd.read_csv('https://raw.githubusercontent.com/AyrtonB/Sage/master/data/elec_data.csv')
df_FUELHH_raw = pd.read_csv('elec_data.csv')

df_FUELHH = df_FUELHH_raw.drop(columns=['Unnamed: 0']).copy().astype(np.int32)
df_FUELHH.index = pd.to_datetime(df_FUELHH_raw['Unnamed: 0'])
df_FUELHH.index.name = 'datetime'

df_FUELHH.head()

Out[2]:

	ccgt	oil	coal	nuclear	wind	ps	npshyd	ocgt	other	intfr	intirl	intned	intew	biomass	intnem
datetime
2015-02-22 00:00:00	5293	0	12980	7777	1456	0	681	0	784	1496	0	1002	0	0	0
2015-02-22 00:30:00	5547	0	12947	7772	1401	0	682	0	784	1496	0	1028	0	0	0
2015-02-22 01:00:00	5168	0	13314	7779	1604	0	594	0	782	1496	0	982	0	0	0
2015-02-22 01:30:00	4498	0	13421	7768	1548	0	592	0	781	1496	0	1010	0	0	0
2015-02-22 02:00:00	4660	0	13304	7778	1351	0	584	0	780	1496	0	1016	0	0	0

We'll quickly check the datatypes

In [3]:

df_FUELHH.dtypes

Out[3]:

ccgt       int32
oil        int32
coal       int32
nuclear    int32
wind       int32
ps         int32
npshyd     int32
ocgt       int32
other      int32
intfr      int32
intirl     int32
intned     int32
intew      int32
biomass    int32
intnem     int32
dtype: object

Creating TileDB Array¶

Now we want to save the dataframe to a TileDB array.

To do this we'll start by defining the domain of the array. Ideally this will a datetime domain with cells every half hour, not knowing the best approach to achieve non-standard time units minutes are used instead.

In [4]:

TileDB_dir = 'data/TileDB'
array_name = 'FUELHH'

#array_dir = f'{TileDB_dir}/{array_name}'
array_dir = array_name

if tiledb.VFS().is_dir(array_dir):
    tiledb.remove(array_dir)

dom = tiledb.Domain(tiledb.Dim(name='datetime', 
                               domain=(np.datetime64('2015-02-22T00:00'), np.datetime64('2019-10-01T00:00')), 
                               tile=525600, # Minutes in a year 
                               dtype='datetime64[ms]'))

dom

Out[4]:

Domain(Dim(name='datetime', domain=(numpy.datetime64('2015-02-22T00:00:00.000'), numpy.datetime64('2019-10-01T00:00:00.000')), tile=525600 milliseconds, dtype='datetime64[ms]'))

We'll now define the schema, having a separate attribute for each fuel type

In [5]:

FUELHH_col_order = ['ccgt', 'oil', 'coal', 'nuclear', 'wind', 'ps', 'npshyd', 'ocgt', 'other', 'intfr', 'intirl', 'intned', 'intew', 'biomass', 'intnem']

attrs = [tiledb.Attr(name=fuel, dtype=np.int32) for fuel in FUELHH_col_order]
schema = tiledb.ArraySchema(domain=dom, sparse=True, attrs=attrs)

schema

Out[5]:

ArraySchema(
  domain=Domain(*[
    Dim(name='datetime', domain=(numpy.datetime64('2015-02-22T00:00:00.000'), numpy.datetime64('2019-10-01T00:00:00.000')), tile=525600 milliseconds, dtype='datetime64[ms]'),
  ]),
  attrs=[
    Attr(name='ccgt', dtype='int32'),
    Attr(name='oil', dtype='int32'),
    Attr(name='coal', dtype='int32'),
    Attr(name='nuclear', dtype='int32'),
    Attr(name='wind', dtype='int32'),
    Attr(name='ps', dtype='int32'),
    Attr(name='npshyd', dtype='int32'),
    Attr(name='ocgt', dtype='int32'),
    Attr(name='other', dtype='int32'),
    Attr(name='intfr', dtype='int32'),
    Attr(name='intirl', dtype='int32'),
    Attr(name='intned', dtype='int32'),
    Attr(name='intew', dtype='int32'),
    Attr(name='biomass', dtype='int32'),
    Attr(name='intnem', dtype='int32'),
  ],
  cell_order='row-major',
  tile_order='row-major', sparse=True)
# note: filters omitted

We're now ready to initialise our array

In [6]:

#tiledb.Array.create(array_dir.replace('/', '\\'), schema)
tiledb.Array.create(array_dir, schema)

We'll put the values into a dictionary of value arrays, with each key mapping to an attribute in the TileDB array

In [7]:

attr_data_dict = df_FUELHH.T.apply(np.array, axis=1).to_dict()

attr_data_dict

Out[7]:

{'ccgt': array([5293, 5547, 5168, ..., 4502, 3893, 4460], dtype=int32),
 'oil': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'coal': array([12980, 12947, 13314, ...,     0,     0,     0], dtype=int32),
 'nuclear': array([7777, 7772, 7779, ..., 6339, 6336, 6319], dtype=int32),
 'wind': array([1456, 1401, 1604, ..., 7933, 8228, 7995], dtype=int32),
 'ps': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'npshyd': array([681, 682, 594, ..., 411, 408, 400], dtype=int32),
 'ocgt': array([0, 0, 0, ..., 1, 0, 0], dtype=int32),
 'other': array([784, 784, 782, ..., 101, 101,  99], dtype=int32),
 'intfr': array([1496, 1496, 1496, ...,  910,  912,  812], dtype=int32),
 'intirl': array([  0,   0,   0, ..., 160, 160, 160], dtype=int32),
 'intned': array([1002, 1028,  982, ...,    0,    0,    0], dtype=int32),
 'intew': array([  0,   0,   0, ..., 120, 112, 118], dtype=int32),
 'biomass': array([   0,    0,    0, ..., 1635, 1605, 1502], dtype=int32),
 'intnem': array([  0,   0,   0, ..., 324, 328, 562], dtype=int32)}

Finally we can save the array

This step is failing, I believe due to the original issue around using non-standard datetime steps in the domain

In [8]:

with tiledb.open(array_dir.replace('/', '\\'), 'w') as TileDB_array:
    TileDB_array[df_FUELHH.index.values] = attr_data_dict

In [9]:

for k,v in attr_data_dict.items():
    print(k, v.shape)

ccgt (80737,)
oil (80737,)
coal (80737,)
nuclear (80737,)
wind (80737,)
ps (80737,)
npshyd (80737,)
ocgt (80737,)
other (80737,)
intfr (80737,)
intirl (80737,)
intned (80737,)
intew (80737,)
biomass (80737,)
intnem (80737,)

In [10]:

with tiledb.open(array_dir.replace('/', '\\'), 'r') as TileDB_array:
    read_in_vals = TileDB_array[:]
    
read_in_vals

Out[10]:

OrderedDict([('ccgt',
              array([5293, 5547, 5168, ..., 4502, 3893, 4460], dtype=int32)),
             ('oil', array([0, 0, 0, ..., 0, 0, 0], dtype=int32)),
             ('coal',
              array([12980, 12947, 13314, ...,     0,     0,     0], dtype=int32)),
             ('nuclear',
              array([7777, 7772, 7779, ..., 6339, 6336, 6319], dtype=int32)),
             ('wind',
              array([1456, 1401, 1604, ..., 7933, 8228, 7995], dtype=int32)),
             ('ps', array([0, 0, 0, ..., 0, 0, 0], dtype=int32)),
             ('npshyd',
              array([681, 682, 594, ..., 411, 408, 400], dtype=int32)),
             ('ocgt', array([0, 0, 0, ..., 1, 0, 0], dtype=int32)),
             ('other',
              array([784, 784, 782, ..., 101, 101,  99], dtype=int32)),
             ('intfr',
              array([1496, 1496, 1496, ...,  910,  912,  812], dtype=int32)),
             ('intirl',
              array([  0,   0,   0, ..., 160, 160, 160], dtype=int32)),
             ('intned',
              array([1002, 1028,  982, ...,    0,    0,    0], dtype=int32)),
             ('intew',
              array([  0,   0,   0, ..., 120, 112, 118], dtype=int32)),
             ('biomass',
              array([   0,    0,    0, ..., 1635, 1605, 1502], dtype=int32)),
             ('intnem',
              array([  0,   0,   0, ..., 324, 328, 562], dtype=int32)),
             ('datetime',
              array(['2015-02-22T00:00:00.000', '2015-02-22T00:30:00.000',
                     '2015-02-22T01:00:00.000', ..., '2019-09-30T23:00:00.000',
                     '2019-09-30T23:30:00.000', '2019-10-01T00:00:00.000'],
                    dtype='datetime64[ms]'))])

In [11]:

# the OrderedDict is convertible to a dataframe
idx = read_in_vals.pop('datetime')
pd.DataFrame(read_in_vals, index=idx)

Out[11]:

	ccgt	oil	coal	nuclear	wind	ps	npshyd	ocgt	other	intfr	intirl	intned	intew	biomass	intnem
2015-02-22 00:00:00	5293	0	12980	7777	1456	0	681	0	784	1496	0	1002	0	0	0
2015-02-22 00:30:00	5547	0	12947	7772	1401	0	682	0	784	1496	0	1028	0	0	0
2015-02-22 01:00:00	5168	0	13314	7779	1604	0	594	0	782	1496	0	982	0	0	0
2015-02-22 01:30:00	4498	0	13421	7768	1548	0	592	0	781	1496	0	1010	0	0	0
2015-02-22 02:00:00	4660	0	13304	7778	1351	0	584	0	780	1496	0	1016	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2019-09-30 22:00:00	6740	0	0	6418	7627	0	453	0	103	1210	80	110	58	1636	0
2019-09-30 22:30:00	4614	0	0	6408	7725	0	432	0	103	1594	160	114	204	1643	0
2019-09-30 23:00:00	4502	0	0	6339	7933	0	411	1	101	910	160	0	120	1635	324
2019-09-30 23:30:00	3893	0	0	6336	8228	0	408	0	101	912	160	0	112	1605	328
2019-10-01 00:00:00	4460	0	0	6319	7995	0	400	0	99	812	160	0	118	1502	562

80737 rows × 15 columns

In [ ]: