#!/usr/bin/env python
# coding: utf-8

# > This is one of the 100 recipes of the [IPython Cookbook](http://ipython-books.github.io/),
# > the definitive guide to high-performance scientific computing and data
# > science in Python.

# 4.11. Manipulating large heterogeneous tables with HDF5 and PyTables
#
# Demo script: creates an HDF5 file with one table (city name + population),
# appends a few rows, shows several ways of reading the data back, then
# deletes the file.

import os

import numpy as np
import tables as tb

# We create a new HDF5 file (mode 'w' truncates any existing file).
f = tb.open_file('myfile.h5', 'w')

try:
    # An HDF5 table with two columns: the name of a city (a string with
    # 64 characters at most) and its population (a 32-bit integer).
    dtype = np.dtype([('city', 'S64'), ('population', 'i4')])

    # Now, we create the table in '/table1'.
    table = f.create_table('/', 'table1', dtype)

    # Let's add a few rows.
    table.append([('Brussels', 1138854),
                  ('London', 8308369),
                  ('Paris', 2243833)])

    # After adding rows, we need to flush the table to commit the
    # changes on disk.
    table.flush()

    # Data can be obtained from the table in a lot of different ways in
    # PyTables. The easiest but least efficient way is to load the entire
    # table in memory, which returns a NumPy array.
    print(table[:])

    # It is also possible to load a particular column (and all rows).
    print(table.col('city'))

    # When dealing with a large number of rows, we can make a SQL-like
    # query in the table to load all rows that satisfy particular
    # conditions (in-kernel query, evaluated without loading everything).
    print([row['city'] for row in table.where('population>2e6')])

    # Finally, we can access particular rows knowing their indices.
    print(table[1])
finally:
    # Always release the HDF5 handle, even if one of the steps above fails.
    f.close()

# Clean-up: remove the temporary HDF5 file.
os.remove('myfile.h5')

# > You'll find all the explanations, figures, references, and much more
# > in the book (to be released later this summer).
#
# > [IPython Cookbook](http://ipython-books.github.io/), by
# > [Cyrille Rossant](http://cyrille.rossant.net), Packt Publishing,
# > 2014 (500 pages).