#!/usr/bin/env python
# coding: utf-8

# > This is one of the 100 recipes of the [IPython Cookbook](http://ipython-books.github.io/),
# > the definitive guide to high-performance scientific computing and data
# > science in Python.

# 4.11. Manipulating large heterogeneous tables with HDF5 and PyTables
#
# Demo script: creates an HDF5 file with one table (city name + population),
# appends a few rows, shows several ways of reading the data back, then
# deletes the file.

import os

import numpy as np
import tables as tb

# We create a new HDF5 file (mode 'w' truncates any existing file).
f = tb.open_file('myfile.h5', 'w')

try:
    # An HDF5 table with two columns: the name of a city (a string with
    # 64 characters at most) and its population (a 32-bit integer).
    dtype = np.dtype([('city', 'S64'), ('population', 'i4')])

    # Now, we create the table in '/table1'.
    table = f.create_table('/', 'table1', dtype)

    # Let's add a few rows.
    table.append([('Brussels', 1138854),
                  ('London', 8308369),
                  ('Paris', 2243833)])

    # After adding rows, we need to flush the table to commit the
    # changes on disk.
    table.flush()

    # Data can be obtained from the table in a lot of different ways in
    # PyTables. The easiest but least efficient way is to load the entire
    # table in memory, which returns a NumPy array.
    print(table[:])

    # It is also possible to load a particular column (and all rows).
    print(table.col('city'))

    # When dealing with a large number of rows, we can make a SQL-like
    # query in the table to load all rows that satisfy particular
    # conditions (in-kernel query, evaluated without loading everything).
    print([row['city'] for row in table.where('population>2e6')])

    # Finally, we can access particular rows knowing their indices.
    print(table[1])
finally:
    # Always release the HDF5 handle, even if one of the steps above fails.
    f.close()

# Clean-up: remove the temporary HDF5 file.
os.remove('myfile.h5')

# > You'll find all the explanations, figures, references, and much more
# > in the book (to be released later this summer).
#
# > [IPython Cookbook](http://ipython-books.github.io/), by
# > [Cyrille Rossant](http://cyrille.rossant.net), Packt Publishing,
# > 2014 (500 pages).