import addutils.toc ; addutils.toc.js(ipy_notebook=True)
from addutils import css_notebook
css_notebook()
import re
import os
import scipy.io
import numpy as np
import pandas as pd
import sqlite3
#from pandas.io.data import DataFrame, read_csv
import pandas.io.sql as psql
from time import time
from IPython.display import display
from addutils import css_notebook
css_notebook()
import sys
Import from '.mat' files
x = scipy.io.loadmat('example_data/matlab_variable.mat')
pyA = x['a']
pyA
array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40], [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41], [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47], [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28], [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34], [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65], [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66], [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72], [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53], [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)
The Matlab variable is passed to a pandas DataFrame:
df = pd.DataFrame(pyA)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 92 | 99 | 1 | 8 | 15 | 67 | 74 | 51 | 58 | 40 |
1 | 98 | 80 | 7 | 14 | 16 | 73 | 55 | 57 | 64 | 41 |
2 | 4 | 81 | 88 | 20 | 22 | 54 | 56 | 63 | 70 | 47 |
3 | 85 | 87 | 19 | 21 | 3 | 60 | 62 | 69 | 71 | 28 |
4 | 86 | 93 | 25 | 2 | 9 | 61 | 68 | 75 | 52 | 34 |
5 | 17 | 24 | 76 | 83 | 90 | 42 | 49 | 26 | 33 | 65 |
6 | 23 | 5 | 82 | 89 | 91 | 48 | 30 | 32 | 39 | 66 |
7 | 79 | 6 | 13 | 95 | 97 | 29 | 31 | 38 | 45 | 72 |
8 | 10 | 12 | 94 | 96 | 78 | 35 | 37 | 44 | 46 | 53 |
9 | 11 | 18 | 100 | 77 | 84 | 36 | 43 | 50 | 27 | 59 |
The following example shows how to import directly a compressed csv file, in this case with multiple separators:
df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\,\;\.',
skipinitialspace=True, compression='gzip', engine='python',
encoding='iso8859_15')
df_csv.head()
ID | Names | Phone | Town | Description | Income | Coordinates | |
---|---|---|---|---|---|---|---|
0 | 1 | Lydia Z. Flores | (02) 0548 5995 | Haddington | sed, sapien. Nunc pulvinar arcu | â¬6,223 | 23.28054, -24.48755 |
1 | 2 | Bell X. Guerra | (07) 8599 9926 | Montes Claros | consectetuer, cursus et, magna. Praesent | â¬14,967 | -77.76329, 69.22339 |
2 | 3 | Kirk Q. Bowman | (06) 4153 7501 | Cagli | tortor. Nunc commodo auctor velit. | â¬17,399 | -58.80037, 22.50537 |
3 | 4 | Karen Pickett | (02) 3216 9708 | Cobourg | at auctor ullamcorper, nisl arcu | â¬17,373 | 75.73982, -78.01872 |
4 | 5 | Vance J. Johnson | (01) 7568 6371 | Carlton | ultricies adipiscing, enim mi tempor | â¬9,025 | -86.51337, 109.46298 |
TODO: utf-8 has problems decoding euro sign. "\u00e2\u0082\u00ac" are the three chars you get when the UTF-8 encoded \u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1.
coor = df_csv['Coordinates']
df_csv['lat'] = ''
df_csv['lon'] = ''
for j, coo in enumerate(coor):
spl = re.split(',', str(coo))
df_csv.loc[j, 'lat'] = spl[0]
df_csv.loc[j, 'lon'] = spl[1]
df_csv.head()
ID | Names | Phone | Town | Description | Income | Coordinates | lat | lon | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Lydia Z. Flores | (02) 0548 5995 | Haddington | sed, sapien. Nunc pulvinar arcu | â¬6,223 | 23.28054, -24.48755 | 23.28054 | -24.48755 |
1 | 2 | Bell X. Guerra | (07) 8599 9926 | Montes Claros | consectetuer, cursus et, magna. Praesent | â¬14,967 | -77.76329, 69.22339 | -77.76329 | 69.22339 |
2 | 3 | Kirk Q. Bowman | (06) 4153 7501 | Cagli | tortor. Nunc commodo auctor velit. | â¬17,399 | -58.80037, 22.50537 | -58.80037 | 22.50537 |
3 | 4 | Karen Pickett | (02) 3216 9708 | Cobourg | at auctor ullamcorper, nisl arcu | â¬17,373 | 75.73982, -78.01872 | 75.73982 | -78.01872 |
4 | 5 | Vance J. Johnson | (01) 7568 6371 | Carlton | ultricies adipiscing, enim mi tempor | â¬9,025 | -86.51337, 109.46298 | -86.51337 | 109.46298 |
df_csv.to_json('temp/converted_json.json')
with open('temp/converted_json.json') as f:
data = f.read().encode('utf-8')
dfjson = pd.read_json(data)
dfjson.head()
Coordinates | Description | ID | Income | Names | Phone | Town | lat | lon | |
---|---|---|---|---|---|---|---|---|---|
0 | 23.28054, -24.48755 | sed, sapien. Nunc pulvinar arcu | 1 | â¬6,223 | Lydia Z. Flores | (02) 0548 5995 | Haddington | 23.28054 | -24.48755 |
1 | -77.76329, 69.22339 | consectetuer, cursus et, magna. Praesent | 2 | â¬14,967 | Bell X. Guerra | (07) 8599 9926 | Montes Claros | -77.76329 | 69.22339 |
10 | -62.95199, 116.29751 | Duis sit amet diam eu | 11 | â¬15,764 | Jameson H. Craig | (07) 8640 5274 | Fairbanks | -62.95199 | 116.29751 |
11 | -42.26793, -92.23605 | et netus et malesuada fames | 12 | â¬18,460 | Michael Randall | (04) 3451 8606 | Mezzana | -42.26793 | -92.23605 |
12 | -20.39897, 152.4122 | id risus quis diam luctus | 13 | â¬6,175 | Sierra Rivers | (01) 2020 4511 | Lugnano in Teverina | -20.39897 | 152.41220 |
Since the import reordered columns in alphabetical order, we can choose a preferred column order:
dfjson = dfjson.ix[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]
dfjson.head()
ID | Names | Phone | Income | Town | lat | lon | |
---|---|---|---|---|---|---|---|
0 | 1 | Lydia Z. Flores | (02) 0548 5995 | â¬6,223 | Haddington | 23.28054 | -24.48755 |
1 | 2 | Bell X. Guerra | (07) 8599 9926 | â¬14,967 | Montes Claros | -77.76329 | 69.22339 |
10 | 11 | Jameson H. Craig | (07) 8640 5274 | â¬15,764 | Fairbanks | -62.95199 | 116.29751 |
11 | 12 | Michael Randall | (04) 3451 8606 | â¬18,460 | Mezzana | -42.26793 | -92.23605 |
12 | 13 | Sierra Rivers | (01) 2020 4511 | â¬6,175 | Lugnano in Teverina | -20.39897 | 152.41220 |
Note: read_html
returns a list of DataFrame objects, even if there is only a single table contained in the HTML
content. Infer_types avoids the function trying to automatically detect numeric and date types (this generated an error with coordinates)
dfhtml = pd.read_html('example_data/generated.html', header=0)
dfhtml[0].head()
Nomi maschili | Strade | Città | Coordinate | Ente | ||
---|---|---|---|---|---|---|
0 | Drew | 5672 Accumsan Road | Whitburn | -54.68429, -67.21709 | Ac Consulting | a.aliquet.vel@vitae.co.uk |
1 | Finn | Ap #170-4074 Interdum Rd. | Pelotas | 81.93087, 168.14556 | Varius Incorporated | nec.malesuada.ut@primisin.com |
2 | Martin | Ap #471-6260 Etiam Ave | Aliano | 33.99272, -148.70584 | Arcu Industries | lacus@risus.net |
3 | Zephania | Ap #720-7951 Aliquam Ave | Senneville | -56.82569, -72.80435 | Tristique PC | arcu@sempereratin.co.uk |
4 | Devin | 3538 Dui St. | Zwettl-Niederösterreich | -11.90874, -149.61123 | Imperdiet Dictum PC | lectus.rutrum.urna@massaQuisqueporttitor.ca |
dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')
dfxl.head()
Names | Streets | Town | Coordinates | Corporation | ||
---|---|---|---|---|---|---|
0 | Stephen | Ap #389-3365 Risus, St. | Lelystad | 51.91783, -47.01037 | Consectetuer Rhoncus Nullam Corporation | Suspendisse.sed@pedePraesenteu.org |
1 | Kenneth | 120-6483 Ligula. Ave | Labrecque | -1.92625, 10.02451 | A Corporation | orci@egetmollislectus.com |
2 | Leo | Ap #727-2085 Eget Av. | Strathcona County | -17.31839, 137.99307 | Sed Limited | Etiam@vel.org |
3 | Joshua | P.O. Box 425, 6462 Arcu Rd. | Municipal District | -51.34642, 80.32145 | Mauris Sapien Cursus Corp. | lorem@ornarelectusante.net |
4 | Zephania | 655 Et, St. | Couillet | 29.96525, 124.18391 | Odio Semper Cursus Corp. | metus@a.ca |
Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:
con = sqlite3.connect("temp.sql")
sqlfile = dfxl.to_sql('Names', con, flavor='sqlite')
con = sqlite3.connect('temp.sql')
with con:
sql = "SELECT * FROM Names;"
df = psql.read_sql(sql, con)
print(df.shape)
(100, 7)
df.head()
index | Names | Streets | Town | Coordinates | Corporation | ||
---|---|---|---|---|---|---|---|
0 | 0 | Stephen | Ap #389-3365 Risus, St. | Lelystad | 51.91783, -47.01037 | Consectetuer Rhoncus Nullam Corporation | Suspendisse.sed@pedePraesenteu.org |
1 | 1 | Kenneth | 120-6483 Ligula. Ave | Labrecque | -1.92625, 10.02451 | A Corporation | orci@egetmollislectus.com |
2 | 2 | Leo | Ap #727-2085 Eget Av. | Strathcona County | -17.31839, 137.99307 | Sed Limited | Etiam@vel.org |
3 | 3 | Joshua | P.O. Box 425, 6462 Arcu Rd. | Municipal District | -51.34642, 80.32145 | Mauris Sapien Cursus Corp. | lorem@ornarelectusante.net |
4 | 4 | Zephania | 655 Et, St. | Couillet | 29.96525, 124.18391 | Odio Semper Cursus Corp. | metus@a.ca |
con.close()
os.remove("temp.sql")
HDFStore is a dict-like object used by pandas to store datasets as HDF5 files using the PyTables library. HDF5 is a scientific hierarchical data format suitable for storing in a file very large and multi-dimensional data arrays. The Storer format stores fixed arrays, which are queryiable and must be retrieved in their entirety.
Add DataFrames to the HDFStore object:
samples_01 = int(3e5)
samples_02 = int(1e5)
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')
randn = np.random.randn
df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])
print ('Size of the Dataset: ', (df1.values.nbytes+df1.values.nbytes)/2**20, ' MB')
Size of the Dataset: 13.73291015625 MB
%%timeit
with pd.get_store('temp/store53.h5') as store:
store.put('storer/df1', df1)
store.put('storer/df2', df2)
store.put('to_remove', df2)
10 loops, best of 3: 32 ms per loop
Retrieve stored objects:
with pd.get_store('temp/store53.h5') as store:
print (store.keys())
df1_retrieved = store.get('storer/df1')
print (df1_retrieved[1:3])
print (df1[1:3])
print ('Check retrieved data equal to original data: ')
print (df1_retrieved[1:3].equals(df1[1:3]))
['/to_remove', '/storer/df1', '/storer/df2'] A B C 2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709 2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842 A B C 2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709 2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842 Check retrieved data equal to original data: True
Delete objects:
with pd.get_store('temp/store53.h5') as store:
try:
store.remove('to_remove')
except:
pass
print (store)
<class 'pandas.io.pytables.HDFStore'> File path: temp/store53.h5 /storer/df1 frame (shape->[300000,3]) /storer/df2 frame (shape->[100000,4])
The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported.
with pd.get_store('temp/store53.h5') as store:
# store.append creates a table automatically:
store.append('table/df1_appended', df1.ix[:10000])
with pd.get_store('temp/store53.h5') as store:
store.append('table/df1_appended', df1.ix[10001:20000])
store.append('table/df1_appended', df1.ix[20001:50000])
Query the table using boolean expression with in-line function evaluation:
with pd.get_store('temp/store53.h5') as store:
query01 = store.select('table/df1_appended',
"index>=Timestamp('2000-01-01 12:00:00.20+01:00') \
& index<Timestamp('2000-01-01 12:00:00.40+01:00') \
& columns=['A', 'B']")
query01
A | B | |
---|---|---|
2000-01-01 12:00:00.200000+01:00 | -0.232597 | -0.705741 |
2000-01-01 12:00:00.250000+01:00 | 0.183234 | 0.024464 |
2000-01-01 12:00:00.300000+01:00 | -0.801220 | -0.732630 |
2000-01-01 12:00:00.350000+01:00 | 0.762667 | -0.585462 |
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.