# Notebook setup: table of contents + custom CSS (addutils is a project-local helper).
import addutils.toc ; addutils.toc.js(ipy_notebook=True)
from addutils import css_notebook
css_notebook()
import re
import os
import scipy.io
import numpy as np
import pandas as pd
import sqlite3
import pandas.io.sql as psql
from time import time
from IPython.display import display
# NOTE(review): css_notebook is already imported and called above — the
# re-import and second call below are redundant duplicates.
from addutils import css_notebook
css_notebook()
import sys
Import from '.mat' files
# Load the MATLAB workspace file; loadmat returns a dict mapping the
# MATLAB variable names to NumPy arrays.
mat_contents = scipy.io.loadmat('example_data/matlab_variable.mat')
pyA = mat_contents['a']
pyA
array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40], [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41], [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47], [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28], [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34], [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65], [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66], [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72], [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53], [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)
The Matlab variable is passed to a pandas DataFrame:
# Wrap the MATLAB array in a DataFrame (default integer row/column labels).
df = pd.DataFrame(data=pyA)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 92 | 99 | 1 | 8 | 15 | 67 | 74 | 51 | 58 | 40 |
1 | 98 | 80 | 7 | 14 | 16 | 73 | 55 | 57 | 64 | 41 |
2 | 4 | 81 | 88 | 20 | 22 | 54 | 56 | 63 | 70 | 47 |
3 | 85 | 87 | 19 | 21 | 3 | 60 | 62 | 69 | 71 | 28 |
4 | 86 | 93 | 25 | 2 | 9 | 61 | 68 | 75 | 52 | 34 |
5 | 17 | 24 | 76 | 83 | 90 | 42 | 49 | 26 | 33 | 65 |
6 | 23 | 5 | 82 | 89 | 91 | 48 | 30 | 32 | 39 | 66 |
7 | 79 | 6 | 13 | 95 | 97 | 29 | 31 | 38 | 45 | 72 |
8 | 10 | 12 | 94 | 96 | 78 | 35 | 37 | 44 | 46 | 53 |
9 | 11 | 18 | 100 | 77 | 84 | 36 | 43 | 50 | 27 | 59 |
The following example shows how to import directly a compressed csv file, in this case with multiple separators:
# Read a gzip-compressed CSV whose fields are separated by the literal
# three-character sequence ",;." — a regex separator requires the (slower)
# Python parsing engine.
csv_path = 'example_data/pd12_peopl.csv.gz'
df_csv = pd.read_csv(csv_path,
                     sep=r'\,\;\.',
                     skipinitialspace=True,
                     compression='gzip',
                     engine='python',
                     encoding='iso8859_15')
df_csv.head()
ID | Names | Phone | Town | Description | Income | Coordinates | |
---|---|---|---|---|---|---|---|
0 | 1 | Lydia Z. Flores | (02) 0548 5995 | Haddington | sed, sapien. Nunc pulvinar arcu | â¬6,223 | 23.28054, -24.48755 |
1 | 2 | Bell X. Guerra | (07) 8599 9926 | Montes Claros | consectetuer, cursus et, magna. Praesent | â¬14,967 | -77.76329, 69.22339 |
2 | 3 | Kirk Q. Bowman | (06) 4153 7501 | Cagli | tortor. Nunc commodo auctor velit. | â¬17,399 | -58.80037, 22.50537 |
3 | 4 | Karen Pickett | (02) 3216 9708 | Cobourg | at auctor ullamcorper, nisl arcu | â¬17,373 | 75.73982, -78.01872 |
4 | 5 | Vance J. Johnson | (01) 7568 6371 | Carlton | ultricies adipiscing, enim mi tempor | â¬9,025 | -86.51337, 109.46298 |
TODO: the euro sign is being mis-decoded. "\u00e2\u0082\u00ac" are the three characters you get when the UTF-8 encoding of \u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1; this is why "â¬" appears in the Income column above.
# Split the 'Coordinates' column ("<lat>, <lon>") into separate 'lat'/'lon'
# string columns. The original iterated row by row with re.split, mixing
# positional enumerate() indices with label-based .loc (which only agree on
# a default RangeIndex); a vectorized str.split is both faster and
# index-safe. Taking elements [0] and [1] of the split reproduces the
# original spl[0]/spl[1] exactly (the lon piece keeps its leading space).
coor = df_csv['Coordinates']
parts = coor.astype(str).str.split(',')
df_csv['lat'] = parts.str[0]
df_csv['lon'] = parts.str[1]
df_csv.head()
ID | Names | Phone | Town | Description | Income | Coordinates | lat | lon | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Lydia Z. Flores | (02) 0548 5995 | Haddington | sed, sapien. Nunc pulvinar arcu | â¬6,223 | 23.28054, -24.48755 | 23.28054 | -24.48755 |
1 | 2 | Bell X. Guerra | (07) 8599 9926 | Montes Claros | consectetuer, cursus et, magna. Praesent | â¬14,967 | -77.76329, 69.22339 | -77.76329 | 69.22339 |
2 | 3 | Kirk Q. Bowman | (06) 4153 7501 | Cagli | tortor. Nunc commodo auctor velit. | â¬17,399 | -58.80037, 22.50537 | -58.80037 | 22.50537 |
3 | 4 | Karen Pickett | (02) 3216 9708 | Cobourg | at auctor ullamcorper, nisl arcu | â¬17,373 | 75.73982, -78.01872 | 75.73982 | -78.01872 |
4 | 5 | Vance J. Johnson | (01) 7568 6371 | Carlton | ultricies adipiscing, enim mi tempor | â¬9,025 | -86.51337, 109.46298 | -86.51337 | 109.46298 |
# Round-trip the DataFrame through JSON on disk, then read it straight back
# from the file path. The original read the file as text, re-encoded it to
# UTF-8 bytes and handed the literal payload to read_json — passing literal
# JSON to read_json is deprecated in modern pandas, and the intermediate
# decode/encode step is unnecessary when reading from the path directly.
df_csv.to_json('temp/converted_json.json')
dfjson = pd.read_json('temp/converted_json.json')
dfjson.head()
Coordinates | Description | ID | Income | Names | Phone | Town | lat | lon | |
---|---|---|---|---|---|---|---|---|---|
0 | 23.28054, -24.48755 | sed, sapien. Nunc pulvinar arcu | 1 | â¬6,223 | Lydia Z. Flores | (02) 0548 5995 | Haddington | 23.28054 | -24.48755 |
1 | -77.76329, 69.22339 | consectetuer, cursus et, magna. Praesent | 2 | â¬14,967 | Bell X. Guerra | (07) 8599 9926 | Montes Claros | -77.76329 | 69.22339 |
10 | -62.95199, 116.29751 | Duis sit amet diam eu | 11 | â¬15,764 | Jameson H. Craig | (07) 8640 5274 | Fairbanks | -62.95199 | 116.29751 |
11 | -42.26793, -92.23605 | et netus et malesuada fames | 12 | â¬18,460 | Michael Randall | (04) 3451 8606 | Mezzana | -42.26793 | -92.23605 |
12 | -20.39897, 152.4122 | id risus quis diam luctus | 13 | â¬6,175 | Sierra Rivers | (01) 2020 4511 | Lugnano in Teverina | -20.39897 | 152.41220 |
Since the import reordered columns in alphabetical order, we can choose a preferred column order:
# Re-select the columns in a preferred order (this also drops the columns
# not listed, e.g. 'Description' and 'Coordinates').
preferred_cols = ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']
dfjson = dfjson.loc[:, preferred_cols]
dfjson.head()
ID | Names | Phone | Income | Town | lat | lon | |
---|---|---|---|---|---|---|---|
0 | 1 | Lydia Z. Flores | (02) 0548 5995 | â¬6,223 | Haddington | 23.28054 | -24.48755 |
1 | 2 | Bell X. Guerra | (07) 8599 9926 | â¬14,967 | Montes Claros | -77.76329 | 69.22339 |
10 | 11 | Jameson H. Craig | (07) 8640 5274 | â¬15,764 | Fairbanks | -62.95199 | 116.29751 |
11 | 12 | Michael Randall | (04) 3451 8606 | â¬18,460 | Mezzana | -42.26793 | -92.23605 |
12 | 13 | Sierra Rivers | (01) 2020 4511 | â¬6,175 | Lugnano in Teverina | -20.39897 | 152.41220 |
Note: read_html
returns a list of DataFrame objects, even if there is only a single table contained in the HTML
content. (In older pandas versions the `infer_types` argument avoided the function trying to automatically detect numeric and date types, which generated an error with coordinates; that parameter has since been removed from pandas and is not used here.)
# read_html returns a *list* of DataFrames, one per <table> element found,
# so we index the first one to display it.
dfhtml = pd.read_html('example_data/generated.html', header=0)
first_table = dfhtml[0]
first_table.head()
Nomi maschili | Strade | Città | Coordinate | Ente | ||
---|---|---|---|---|---|---|
0 | Drew | 5672 Accumsan Road | Whitburn | -54.68429, -67.21709 | Ac Consulting | a.aliquet.vel@vitae.co.uk |
1 | Finn | Ap #170-4074 Interdum Rd. | Pelotas | 81.93087, 168.14556 | Varius Incorporated | nec.malesuada.ut@primisin.com |
2 | Martin | Ap #471-6260 Etiam Ave | Aliano | 33.99272, -148.70584 | Arcu Industries | lacus@risus.net |
3 | Zephania | Ap #720-7951 Aliquam Ave | Senneville | -56.82569, -72.80435 | Tristique PC | arcu@sempereratin.co.uk |
4 | Devin | 3538 Dui St. | Zwettl-Niederösterreich | -11.90874, -149.61123 | Imperdiet Dictum PC | lectus.rutrum.urna@massaQuisqueporttitor.ca |
# Read the sheet named 'foglio' from the Excel workbook.
dfxl = pd.read_excel('example_data/generated2.xls', sheet_name='foglio')
dfxl.head()
Names | Streets | Town | Coordinates | Corporation | ||
---|---|---|---|---|---|---|
0 | Stephen | Ap #389-3365 Risus, St. | Lelystad | 51.91783, -47.01037 | Consectetuer Rhoncus Nullam Corporation | Suspendisse.sed@pedePraesenteu.org |
1 | Kenneth | 120-6483 Ligula. Ave | Labrecque | -1.92625, 10.02451 | A Corporation | orci@egetmollislectus.com |
2 | Leo | Ap #727-2085 Eget Av. | Strathcona County | -17.31839, 137.99307 | Sed Limited | Etiam@vel.org |
3 | Joshua | P.O. Box 425, 6462 Arcu Rd. | Municipal District | -51.34642, 80.32145 | Mauris Sapien Cursus Corp. | lorem@ornarelectusante.net |
4 | Zephania | 655 Et, St. | Couillet | 29.96525, 124.18391 | Odio Semper Cursus Corp. | metus@a.ca |
Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:
# Save the Excel DataFrame into a single-file SQLite database ('Names' is
# the table name we chose), then read it back. The original opened a second
# connection to the same file at this point without closing the first one —
# a connection leak; one connection serves both the write and the read.
# (`with con:` scopes a transaction; the connection itself is closed later.)
con = sqlite3.connect("temp.sql")
dfxl.to_sql('Names', con)
with con:
    sql = "SELECT * FROM Names;"
    df = psql.read_sql(sql, con)
print(df.shape)
(100, 7)
df.head()
index | Names | Streets | Town | Coordinates | Corporation | ||
---|---|---|---|---|---|---|---|
0 | 0 | Stephen | Ap #389-3365 Risus, St. | Lelystad | 51.91783, -47.01037 | Consectetuer Rhoncus Nullam Corporation | Suspendisse.sed@pedePraesenteu.org |
1 | 1 | Kenneth | 120-6483 Ligula. Ave | Labrecque | -1.92625, 10.02451 | A Corporation | orci@egetmollislectus.com |
2 | 2 | Leo | Ap #727-2085 Eget Av. | Strathcona County | -17.31839, 137.99307 | Sed Limited | Etiam@vel.org |
3 | 3 | Joshua | P.O. Box 425, 6462 Arcu Rd. | Municipal District | -51.34642, 80.32145 | Mauris Sapien Cursus Corp. | lorem@ornarelectusante.net |
4 | 4 | Zephania | 655 Et, St. | Couillet | 29.96525, 124.18391 | Odio Semper Cursus Corp. | metus@a.ca |
con.close()  # release the SQLite connection before deleting its file
os.remove("temp.sql")  # clean up the temporary single-file database
HDFStore is a dict-like object used by pandas to store datasets as HDF5 files using the PyTables library. HDF5 is a scientific hierarchical data format suitable for storing very large and multi-dimensional data arrays in a single file. The Storer (fixed) format stores fixed arrays, which are not queryable and must be retrieved in their entirety.
Add DataFrames to the HDFStore object:
# Build two random DataFrames on timezone-aware timestamp indexes: df1 has
# 300k rows at 50 ms intervals, df2 has 100k rows at 100 ms intervals
# (starting 5 minutes later).
samples_01 = int(3e5)
samples_02 = int(1e5)
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')
randn = np.random.randn
df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])
# Bug fix: the original summed df1's size twice (hence the 13.73 MB it
# printed); the dataset size is df1 + df2.
print('Size of the Dataset: ', (df1.values.nbytes + df2.values.nbytes)/2**20, ' MB')
Size of the Dataset: 13.73291015625 MB
%%timeit
# Persist both DataFrames in the fixed ('storer') format; keys behave like
# filesystem paths inside the HDF5 file.
with pd.HDFStore('temp/store53.h5') as store:
    store.put('storer/df1', df1)
    store.put('storer/df2', df2)
    store.put('to_remove', df2)  # throwaway key, deleted later to demo remove()
20.4 ms ± 612 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Retrieve stored objects:
# Reopen the store, list its keys, retrieve df1 and verify the round trip
# preserved the data exactly.
with pd.HDFStore('temp/store53.h5') as store:
    print (store.keys())
    df1_retrieved = store.get('storer/df1')
    print (df1_retrieved[1:3])
    print (df1[1:3])
    print ('Check retrieved data equal to original data: ')
    print (df1_retrieved[1:3].equals(df1[1:3]))
['/', '/to_remove', '/storer/df1', '/storer/df2', '/table/df1_appended'] A B C 2000-01-01 12:00:00.050000+01:00 -1.224140 0.058376 2.181400 2000-01-01 12:00:00.100000+01:00 0.461911 0.018024 -0.165891 A B C 2000-01-01 12:00:00.050000+01:00 -1.224140 0.058376 2.181400 2000-01-01 12:00:00.100000+01:00 0.461911 0.018024 -0.165891 Check retrieved data equal to original data: True
Delete objects:
# Delete the throwaway key. HDFStore.remove raises KeyError when the key is
# absent (e.g. on a second run of this cell), so we tolerate exactly that;
# the original bare `except:` also silenced unrelated errors such as
# KeyboardInterrupt and genuine I/O failures.
with pd.HDFStore('temp/store53.h5') as store:
    try:
        store.remove('to_remove')
    except KeyError:
        pass
    print(store)
<class 'pandas.io.pytables.HDFStore'> File path: temp/store53.h5
The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported.
# Table format supports appending across sessions.
with pd.HDFStore('temp/store53.h5') as store:
    # store.append creates a table automatically:
    store.append('table/df1_appended', df1.iloc[:10000])
with pd.HDFStore('temp/store53.h5') as store:
    # Bug fix: iloc end bounds are exclusive, so the original slices
    # [10001:20000] and [20001:50000] silently dropped rows 10000 and 20000.
    # Contiguous chunks append every row exactly once.
    store.append('table/df1_appended', df1.iloc[10000:20000])
    store.append('table/df1_appended', df1.iloc[20000:50000])
Query the table using boolean expression with in-line function evaluation:
# Table-format stores support server-side queries: the where-string selects
# the half-open index interval [12:00:00.20, 12:00:00.40) and restricts the
# returned columns to A and B.
with pd.HDFStore('temp/store53.h5') as store:
    query01 = store.select('table/df1_appended',
        "index>=Timestamp('2000-01-01 12:00:00.20+01:00') \
        & index<Timestamp('2000-01-01 12:00:00.40+01:00') \
        & columns=['A', 'B']")
query01
A | B | |
---|---|---|
2000-01-01 12:00:00.200000+01:00 | -0.516980 | 0.231047 |
2000-01-01 12:00:00.250000+01:00 | 0.151773 | 1.263926 |
2000-01-01 12:00:00.300000+01:00 | -0.221863 | 2.628103 |
2000-01-01 12:00:00.350000+01:00 | 0.454621 | -0.378720 |
2000-01-01 12:00:00.200000+01:00 | -3.104964 | -1.301010 |
2000-01-01 12:00:00.250000+01:00 | 0.883893 | 0.319630 |
2000-01-01 12:00:00.300000+01:00 | 0.780367 | -0.921197 |
2000-01-01 12:00:00.350000+01:00 | 1.117064 | -0.678611 |
2000-01-01 12:00:00.200000+01:00 | 1.091494 | -1.163870 |
2000-01-01 12:00:00.250000+01:00 | -0.131051 | -1.156213 |
2000-01-01 12:00:00.300000+01:00 | 0.773293 | -1.646943 |
2000-01-01 12:00:00.350000+01:00 | -0.293174 | 0.268033 |
2000-01-01 12:00:00.200000+01:00 | 0.209013 | 0.178377 |
2000-01-01 12:00:00.250000+01:00 | 0.714508 | -0.081073 |
2000-01-01 12:00:00.300000+01:00 | -0.115000 | -1.940541 |
2000-01-01 12:00:00.350000+01:00 | -0.317933 | 0.245165 |
2000-01-01 12:00:00.200000+01:00 | 1.088952 | -0.470424 |
2000-01-01 12:00:00.250000+01:00 | -1.921369 | -1.620379 |
2000-01-01 12:00:00.300000+01:00 | 0.799032 | 0.974941 |
2000-01-01 12:00:00.350000+01:00 | 0.839663 | 0.301037 |
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.