#!/usr/bin/env python # coding: utf-8 # # pandas I/O tools and examples # In[1]: import addutils.toc ; addutils.toc.js(ipy_notebook=True) # In[2]: from addutils import css_notebook css_notebook() # ## 1 Matlab Variables # ### 1.1 Import a Matlab variable from file # In[3]: import re import os import scipy.io import numpy as np import pandas as pd import sqlite3 import pandas.io.sql as psql from time import time from IPython.display import display from addutils import css_notebook css_notebook() # In[4]: import sys # Import from '.mat' files # In[5]: x = scipy.io.loadmat('example_data/matlab_variable.mat') # In[6]: pyA = x['a'] pyA # The Matlab variable is passed to a pandas DataFrame: # In[7]: df = pd.DataFrame(pyA) df # ## 2 Importing a compressed CSV # The following example shows how to import directly a compressed csv file, in this case with multiple separators: # In[8]: df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\,\;\.', skipinitialspace=True, compression='gzip', engine='python', encoding='iso8859_15') df_csv.head() # **TODO**: utf-8 has problems decoding euro sign. "\u00e2\u0082\u00ac" are the three chars you get when the UTF-8 encoded \u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1. 
# In[9]:
# Split the "Coordinates" column ("lat,lon") into two new columns.
# A single vectorized str.split replaces the original row-by-row loop
# (one re.split + two .loc scalar assignments per row), which was
# O(n) Python-level work and the dominant cost of this cell.
latlon = df_csv['Coordinates'].astype(str).str.split(',', expand=True)
df_csv['lat'] = latlon[0]
df_csv['lon'] = latlon[1]
df_csv.head()

# In[10]:
df_csv.to_json('temp/converted_json.json')

# ## 3 Importing JSON files

# In[11]:
# read_json accepts a file path directly; the original read the file
# manually and passed utf-8 encoded *bytes*, which newer pandas versions
# deprecate.
dfjson = pd.read_json('temp/converted_json.json')
dfjson.head()

# Since the import reordered columns in alphabetical order, we can choose a
# preferred column order:

# In[12]:
dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]
dfjson.head()

# ## 4 Importing HTML
# Note: `read_html` returns a **list** of DataFrame objects, even if there
# is only a single table contained in the `HTML` content.

# In[13]:
dfhtml = pd.read_html('example_data/generated.html', header=0)
dfhtml[0].head()

# ## 5 Importing Excel

# In[14]:
# 'foglio' is the name of the worksheet to load.
dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')
dfxl.head()

# ## 6 Working with SQL and databases
# ### 6.1 Write SQL
# Let's store the DataFrame opened from excel in a database. We use SQLite,
# a database engine library suitable for storing data in a single-file
# database. 'Names' is the name we chose for the database table:

# In[15]:
con = sqlite3.connect("temp.sql")
# if_exists='replace' makes this cell re-runnable: without it to_sql
# raises ValueError when the 'Names' table already exists.
sqlfile = dfxl.to_sql('Names', con, if_exists='replace')
# Close this handle; the read below opens its own connection (the
# original leaked this one).
con.close()

# ### 6.2 Import SQL

# In[16]:
con = sqlite3.connect('temp.sql')
with con:
    sql = "SELECT * FROM Names;"
    df = psql.read_sql(sql, con)
    print(df.shape)

# In[17]:
df.head()

# In[18]:
con.close()
os.remove("temp.sql")
# The **Storer** format stores fixed arrays, which are queryable and must
# be retrieved in their entirety.

# Add DataFrames to the HDFStore object:

# In[19]:
samples_01 = int(3e5)
samples_02 = int(1e5)
# Two overlapping timezone-aware time indexes at 50ms and 100ms frequency.
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms',
                     tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms',
                     tz='Europe/Rome')
randn = np.random.randn

# In[20]:
df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2,
                   columns=['A', 'B', 'C', 'D'])

# In[21]:
# BUG FIX: the original summed df1's size twice; the dataset is df1 + df2.
print('Size of the Dataset: ',
      (df1.values.nbytes + df2.values.nbytes) / 2**20, ' MB')

# In[22]:
get_ipython().run_cell_magic('timeit', '', "with pd.HDFStore('temp/store53.h5') as store:\n    store.put('storer/df1', df1)\n    store.put('storer/df2', df2)\n    store.put('to_remove', df2)\n")

# Retrieve stored objects:

# In[23]:
with pd.HDFStore('temp/store53.h5') as store:
    print(store.keys())
    df1_retrieved = store.get('storer/df1')
    print(df1_retrieved[1:3])
    print(df1[1:3])
    print('Check retrieved data equal to original data: ')
    print(df1_retrieved[1:3].equals(df1[1:3]))

# Delete objects:

# In[24]:
with pd.HDFStore('temp/store53.h5') as store:
    # Only KeyError (node already removed) should be ignored; the
    # original bare `except:` would also swallow KeyboardInterrupt and
    # SystemExit.
    try:
        store.remove('to_remove')
    except KeyError:
        pass
    print(store)

# ### 7.2 Table format
# The table format conceptually is shaped very much like a DataFrame and
# may be appended to in the same or other sessions. In addition,
# delete & query type operations are supported.
# In[25]:
with pd.HDFStore('temp/store53.h5') as store:
    # store.append creates a table automatically:
    store.append('table/df1_appended', df1.iloc[:10000])

# In[26]:
# BUG FIX: the original appended iloc[10001:20000] and iloc[20001:50000],
# silently dropping rows 10000 and 20000; slices are half-open, so the
# next slice must start where the previous one stopped.
with pd.HDFStore('temp/store53.h5') as store:
    store.append('table/df1_appended', df1.iloc[10000:20000])
    store.append('table/df1_appended', df1.iloc[20000:50000])

# ### 7.3 Querying a Table
# Query the table using a boolean expression with in-line function
# evaluation:

# In[27]:
# NOTE(review): the tail of this cell was garbled in the exported file
# (the query string was truncated and fused with the closing credits).
# The upper bound of the time window below is a reconstruction — confirm
# against the original notebook.
with pd.HDFStore('temp/store53.h5') as store:
    query01 = store.select(
        'table/df1_appended',
        "index>=Timestamp('2000-01-01 12:00:00.20+01:00') "
        "& index<=Timestamp('2000-01-01 12:00:00.50+01:00')")
print(query01.head())

# Visit [addutils](http://www.addutils.com) for more tutorials and updates.
#
# This work is licensed under a Creative Commons Attribution-ShareAlike 4.0
# International License.