TileDB Test


Imports

In [1]:
import pandas as pd
import numpy as np
import tiledb
import requests
import io


Reading in Data

We'll start by reading in data on the UL electricity generation between 2015 and 2019

In [2]:
#df_FUELHH_raw = pd.read_csv('https://raw.githubusercontent.com/AyrtonB/Sage/master/data/elec_data.csv')
df_FUELHH_raw = pd.read_csv('elec_data.csv')

df_FUELHH = df_FUELHH_raw.drop(columns=['Unnamed: 0']).copy().astype(np.int32)
df_FUELHH.index = pd.to_datetime(df_FUELHH_raw['Unnamed: 0'])
df_FUELHH.index.name = 'datetime'

df_FUELHH.head()
Out[2]:
ccgt oil coal nuclear wind ps npshyd ocgt other intfr intirl intned intew biomass intnem
datetime
2015-02-22 00:00:00 5293 0 12980 7777 1456 0 681 0 784 1496 0 1002 0 0 0
2015-02-22 00:30:00 5547 0 12947 7772 1401 0 682 0 784 1496 0 1028 0 0 0
2015-02-22 01:00:00 5168 0 13314 7779 1604 0 594 0 782 1496 0 982 0 0 0
2015-02-22 01:30:00 4498 0 13421 7768 1548 0 592 0 781 1496 0 1010 0 0 0
2015-02-22 02:00:00 4660 0 13304 7778 1351 0 584 0 780 1496 0 1016 0 0 0


We'll quickly check the datatypes

In [3]:
df_FUELHH.dtypes
Out[3]:
ccgt       int32
oil        int32
coal       int32
nuclear    int32
wind       int32
ps         int32
npshyd     int32
ocgt       int32
other      int32
intfr      int32
intirl     int32
intned     int32
intew      int32
biomass    int32
intnem     int32
dtype: object


Creating TileDB Array

Now we want to save the dataframe to a TileDB array.

To do this we'll start by defining the domain of the array. Ideally this will a datetime domain with cells every half hour, not knowing the best approach to achieve non-standard time units minutes are used instead.

In [4]:
TileDB_dir = 'data/TileDB'
array_name = 'FUELHH'

#array_dir = f'{TileDB_dir}/{array_name}'
array_dir = array_name

if tiledb.VFS().is_dir(array_dir):
    tiledb.remove(array_dir)

dom = tiledb.Domain(tiledb.Dim(name='datetime', 
                               domain=(np.datetime64('2015-02-22T00:00'), np.datetime64('2019-10-01T00:00')), 
                               tile=525600, # Minutes in a year 
                               dtype='datetime64[ms]'))

dom
Out[4]:
Domain(Dim(name='datetime', domain=(numpy.datetime64('2015-02-22T00:00:00.000'), numpy.datetime64('2019-10-01T00:00:00.000')), tile=525600 milliseconds, dtype='datetime64[ms]'))


We'll now define the schema, having a separate attribute for each fuel type

In [5]:
FUELHH_col_order = ['ccgt', 'oil', 'coal', 'nuclear', 'wind', 'ps', 'npshyd', 'ocgt', 'other', 'intfr', 'intirl', 'intned', 'intew', 'biomass', 'intnem']

attrs = [tiledb.Attr(name=fuel, dtype=np.int32) for fuel in FUELHH_col_order]
schema = tiledb.ArraySchema(domain=dom, sparse=True, attrs=attrs)

schema
Out[5]:
ArraySchema(
  domain=Domain(*[
    Dim(name='datetime', domain=(numpy.datetime64('2015-02-22T00:00:00.000'), numpy.datetime64('2019-10-01T00:00:00.000')), tile=525600 milliseconds, dtype='datetime64[ms]'),
  ]),
  attrs=[
    Attr(name='ccgt', dtype='int32'),
    Attr(name='oil', dtype='int32'),
    Attr(name='coal', dtype='int32'),
    Attr(name='nuclear', dtype='int32'),
    Attr(name='wind', dtype='int32'),
    Attr(name='ps', dtype='int32'),
    Attr(name='npshyd', dtype='int32'),
    Attr(name='ocgt', dtype='int32'),
    Attr(name='other', dtype='int32'),
    Attr(name='intfr', dtype='int32'),
    Attr(name='intirl', dtype='int32'),
    Attr(name='intned', dtype='int32'),
    Attr(name='intew', dtype='int32'),
    Attr(name='biomass', dtype='int32'),
    Attr(name='intnem', dtype='int32'),
  ],
  cell_order='row-major',
  tile_order='row-major', sparse=True)
# note: filters omitted


We're now ready to initialise our array

In [6]:
#tiledb.Array.create(array_dir.replace('/', '\\'), schema)
tiledb.Array.create(array_dir, schema)


We'll put the values into a dictionary of value arrays, with each key mapping to an attribute in the TileDB array

In [7]:
attr_data_dict = df_FUELHH.T.apply(np.array, axis=1).to_dict()

attr_data_dict
Out[7]:
{'ccgt': array([5293, 5547, 5168, ..., 4502, 3893, 4460], dtype=int32),
 'oil': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'coal': array([12980, 12947, 13314, ...,     0,     0,     0], dtype=int32),
 'nuclear': array([7777, 7772, 7779, ..., 6339, 6336, 6319], dtype=int32),
 'wind': array([1456, 1401, 1604, ..., 7933, 8228, 7995], dtype=int32),
 'ps': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'npshyd': array([681, 682, 594, ..., 411, 408, 400], dtype=int32),
 'ocgt': array([0, 0, 0, ..., 1, 0, 0], dtype=int32),
 'other': array([784, 784, 782, ..., 101, 101,  99], dtype=int32),
 'intfr': array([1496, 1496, 1496, ...,  910,  912,  812], dtype=int32),
 'intirl': array([  0,   0,   0, ..., 160, 160, 160], dtype=int32),
 'intned': array([1002, 1028,  982, ...,    0,    0,    0], dtype=int32),
 'intew': array([  0,   0,   0, ..., 120, 112, 118], dtype=int32),
 'biomass': array([   0,    0,    0, ..., 1635, 1605, 1502], dtype=int32),
 'intnem': array([  0,   0,   0, ..., 324, 328, 562], dtype=int32)}


Finally we can save the array

This step is failing, I believe due to the original issue around using non-standard datetime steps in the domain

In [8]:
with tiledb.open(array_dir.replace('/', '\\'), 'w') as TileDB_array:
    TileDB_array[df_FUELHH.index.values] = attr_data_dict
In [9]:
for k,v in attr_data_dict.items():
    print(k, v.shape)
ccgt (80737,)
oil (80737,)
coal (80737,)
nuclear (80737,)
wind (80737,)
ps (80737,)
npshyd (80737,)
ocgt (80737,)
other (80737,)
intfr (80737,)
intirl (80737,)
intned (80737,)
intew (80737,)
biomass (80737,)
intnem (80737,)
In [10]:
with tiledb.open(array_dir.replace('/', '\\'), 'r') as TileDB_array:
    read_in_vals = TileDB_array[:]
    
read_in_vals
Out[10]:
OrderedDict([('ccgt',
              array([5293, 5547, 5168, ..., 4502, 3893, 4460], dtype=int32)),
             ('oil', array([0, 0, 0, ..., 0, 0, 0], dtype=int32)),
             ('coal',
              array([12980, 12947, 13314, ...,     0,     0,     0], dtype=int32)),
             ('nuclear',
              array([7777, 7772, 7779, ..., 6339, 6336, 6319], dtype=int32)),
             ('wind',
              array([1456, 1401, 1604, ..., 7933, 8228, 7995], dtype=int32)),
             ('ps', array([0, 0, 0, ..., 0, 0, 0], dtype=int32)),
             ('npshyd',
              array([681, 682, 594, ..., 411, 408, 400], dtype=int32)),
             ('ocgt', array([0, 0, 0, ..., 1, 0, 0], dtype=int32)),
             ('other',
              array([784, 784, 782, ..., 101, 101,  99], dtype=int32)),
             ('intfr',
              array([1496, 1496, 1496, ...,  910,  912,  812], dtype=int32)),
             ('intirl',
              array([  0,   0,   0, ..., 160, 160, 160], dtype=int32)),
             ('intned',
              array([1002, 1028,  982, ...,    0,    0,    0], dtype=int32)),
             ('intew',
              array([  0,   0,   0, ..., 120, 112, 118], dtype=int32)),
             ('biomass',
              array([   0,    0,    0, ..., 1635, 1605, 1502], dtype=int32)),
             ('intnem',
              array([  0,   0,   0, ..., 324, 328, 562], dtype=int32)),
             ('datetime',
              array(['2015-02-22T00:00:00.000', '2015-02-22T00:30:00.000',
                     '2015-02-22T01:00:00.000', ..., '2019-09-30T23:00:00.000',
                     '2019-09-30T23:30:00.000', '2019-10-01T00:00:00.000'],
                    dtype='datetime64[ms]'))])
In [11]:
# the OrderedDict is convertible to a dataframe
idx = read_in_vals.pop('datetime')
pd.DataFrame(read_in_vals, index=idx)
Out[11]:
ccgt oil coal nuclear wind ps npshyd ocgt other intfr intirl intned intew biomass intnem
2015-02-22 00:00:00 5293 0 12980 7777 1456 0 681 0 784 1496 0 1002 0 0 0
2015-02-22 00:30:00 5547 0 12947 7772 1401 0 682 0 784 1496 0 1028 0 0 0
2015-02-22 01:00:00 5168 0 13314 7779 1604 0 594 0 782 1496 0 982 0 0 0
2015-02-22 01:30:00 4498 0 13421 7768 1548 0 592 0 781 1496 0 1010 0 0 0
2015-02-22 02:00:00 4660 0 13304 7778 1351 0 584 0 780 1496 0 1016 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2019-09-30 22:00:00 6740 0 0 6418 7627 0 453 0 103 1210 80 110 58 1636 0
2019-09-30 22:30:00 4614 0 0 6408 7725 0 432 0 103 1594 160 114 204 1643 0
2019-09-30 23:00:00 4502 0 0 6339 7933 0 411 1 101 910 160 0 120 1635 324
2019-09-30 23:30:00 3893 0 0 6336 8228 0 408 0 101 912 160 0 112 1605 328
2019-10-01 00:00:00 4460 0 0 6319 7995 0 400 0 99 812 160 0 118 1502 562

80737 rows × 15 columns

In [ ]:
 
In [ ]: