Robert Johansson
Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 978-1-4842-4245-2).
from __future__ import print_function
import numpy as np
np.random.seed(0)
import pandas as pd
import csv
import json
import h5py
import tables
import pickle
# import cPickle
import msgpack
%%writefile playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
Overwriting playerstats-2013-2014.csv
%%writefile playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1
Overwriting playerstats-2013-2014-top30.csv
!head -n 5 playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO% 1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5 2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0 3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
# Read the whole CSV file into a list of rows; every field is kept as a string,
# and the leading "# ..." comment line is returned as an ordinary row.
# newline="" lets the csv module handle line endings (and quoted newlines) itself.
with open("playerstats-2013-2014.csv", newline="") as f:
    csvreader = csv.reader(f)
    rows = list(csvreader)
rows[1][1:6]
['Player', 'Team', 'Pos', 'GP', 'G']
rows[2][1:6]
['Sidney Crosby', 'PIT', 'C', '80', '36']
data = np.random.randn(100, 3)
np.savetxt("data.csv", data, delimiter=",", header="x, y, z", comments="# Random x, y, z coordinates\n")
!head -n 5 data.csv
# Random x, y, z coordinates x, y, z 1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01 2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01 9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01
data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",")
data_load[1,:]
array([ 2.2408932 , 1.86755799, -0.97727788])
data_load.dtype
dtype('float64')
(data == data_load).all()
True
data = np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes)
data[0][1:6]
array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], dtype='|S13')
np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6,7,8])
array([[ 68., 104., 18.], [ 56., 87., 28.], [ 58., 86., 7.], [ 47., 84., 16.], [ 39., 82., 32.]])
df = pd.read_csv("playerstats-2013-2014.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "GP", "G", "A", "P"]]
Player | GP | G | A | P | |
---|---|---|---|---|---|
Rank | |||||
1 | Sidney Crosby | 80 | 36 | 68 | 104 |
2 | Ryan Getzlaf | 77 | 31 | 56 | 87 |
3 | Claude Giroux | 82 | 28 | 58 | 86 |
4 | Tyler Seguin | 80 | 37 | 47 | 84 |
5 | Corey Perry | 81 | 43 | 39 | 82 |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5 entries, 1 to 5 Data columns (total 20 columns): Player 5 non-null object Team 5 non-null object Pos 5 non-null object GP 5 non-null int64 G 5 non-null int64 A 5 non-null int64 P 5 non-null int64 +/- 5 non-null int64 PIM 5 non-null int64 PPG 5 non-null int64 PPP 5 non-null int64 SHG 5 non-null int64 SHP 5 non-null int64 GW 5 non-null int64 OT 5 non-null int64 S 5 non-null int64 S% 5 non-null float64 TOI/GP 5 non-null object Shift/GP 5 non-null float64 FO% 5 non-null float64 dtypes: float64(3), int64(13), object(4) memory usage: 840.0+ bytes
df[["Player", "GP", "G", "A", "P"]].to_csv("playerstats-2013-2014-subset.csv")
!head -n 5 playerstats-2013-2014-subset.csv
Rank,Player,GP,G,A,P 1,Sidney Crosby,80,36,68,104 2,Ryan Getzlaf,77,31,56,87 3,Claude Giroux,82,28,58,86 4,Tyler Seguin,80,37,47,84
import h5py
# mode = "w", "r", "w-", "r+", "a"
f = h5py.File("data.h5", "w")
f.mode
'r+'
f.flush()
f.close()
f = h5py.File("data.h5", "w")
f.name
'/'
grp1 = f.create_group("experiment1")
grp1.name
'/experiment1'
grp2_meas = f.create_group("experiment2/measurement")
grp2_meas.name
'/experiment2/measurement'
grp2_sim = f.create_group("experiment2/simulation")
grp2_sim.name
'/experiment2/simulation'
f["/experiment1"]
<HDF5 group "/experiment1" (0 members)>
f["/experiment2/simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
grp_expr2 = f["/experiment2"]
grp_expr2['simulation']
<HDF5 group "/experiment2/simulation" (0 members)>
list(f.keys())
['experiment1', 'experiment2']
list(f.items())
[('experiment1', <HDF5 group "/experiment1" (0 members)>), ('experiment2', <HDF5 group "/experiment2" (2 members)>)]
f.visit(lambda x: print(x))
experiment1 experiment2 experiment2/measurement experiment2/simulation
f.visititems(lambda name, value: print(name, value))
experiment1 <HDF5 group "/experiment1" (0 members)> experiment2 <HDF5 group "/experiment2" (2 members)> experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)> experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
"experiment1" in f
True
"simulation" in f["experiment2"]
True
"experiment3" in f
False
f.flush()
!h5ls -r data.h5
/ Group /experiment1 Group /experiment2 Group /experiment2/measurement Group /experiment2/simulation Group
data1 = np.arange(10)
data2 = np.random.randn(100, 100)
f["array1"] = data1
f["/experiment2/measurement/meas1"] = data2
f.visititems(lambda name, value: print(name, value))
array1 <HDF5 dataset "array1": shape (10,), type "<i8"> experiment1 <HDF5 group "/experiment1" (0 members)> experiment2 <HDF5 group "/experiment2" (2 members)> experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)> experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8"> experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
ds = f["array1"]
ds
<HDF5 dataset "array1": shape (10,), type "<i8">
ds.name
'/array1'
ds.dtype
dtype('int64')
ds.shape
(10,)
ds.len()
10
# Read the whole dataset into a NumPy array. Dataset.value is deprecated
# (and removed in h5py 3.0); indexing with an empty tuple is the replacement.
ds[()]
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
ds = f["/experiment2/measurement/meas1"]
ds
<HDF5 dataset "meas1": shape (100, 100), type "<f8">
ds.dtype
dtype('<f8')
ds.shape
(100, 100)
data_full = ds[...]
type(data_full)
numpy.ndarray
data_full.shape
(100, 100)
data_col = ds[:, 0]
data_col.shape
(100,)
ds[10:20:3, 10:20:3]
array([[ 0.60270766, -0.34804638, -0.813596 , -1.29737966], [ 0.91320192, -1.06343294, 0.22734595, 0.52759738], [ 1.25774422, -0.32775492, 1.4849256 , 0.28005786], [-0.84907287, -0.30000358, 1.79691852, -0.19871506]])
ds[[1,2,3], :].shape
(3, 100)
ds[[1,2,3], :].shape
(3, 100)
mask = ds[:, 0] > 2.0
mask.shape, mask.dtype
((100,), dtype('bool'))
ds[mask, 0]
array([2.04253623, 2.1041854 , 2.05689385])
ds[mask, :5]
array([[ 2.04253623, -0.91946118, 0.11467003, -0.1374237 , 1.36552692], [ 2.1041854 , 0.22725706, -1.1291663 , -0.28133197, -0.7394167 ], [ 2.05689385, 0.18041971, -0.06670925, -0.02835398, 0.48480475]])
# create empty data sets, assign and update datasets
ds = f.create_dataset("array2", data=np.random.randint(10, size=10))
ds
<HDF5 dataset "array2": shape (10,), type "<i8">
# Read the whole dataset; Dataset.value is deprecated (removed in h5py 3.0).
ds[()]
array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])
ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1)
ds
<HDF5 dataset "data1": shape (5, 5), type "<f4">
# Read the whole dataset; Dataset.value is deprecated (removed in h5py 3.0).
# Nothing has been written yet, so every element is the fill value.
ds[()]
array([[-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.]], dtype=float32)
ds = f.create_dataset("/experiment1/simulation/data1", shape=(5000, 5000, 5000),
fillvalue=0, compression='gzip')
ds
<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
ds[:, 0, 0] = np.random.rand(5000)
ds[1, :, 0] += np.random.rand(5000)
ds[:2, :5, 0]
array([[0.6939344 , 0. , 0. , 0. , 0. ], [1.4819994 , 0.01639538, 0.54387355, 0.11130908, 0.9928771 ]], dtype=float32)
ds.fillvalue
0.0
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (1 members)> simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
float(np.prod(ds.shape) * ds[0,0,0].nbytes) / (1024**3) # uncompressed size in GiB: element count * bytes per element / 1024**3
465.66128730773926
f.flush()
f.filename
'data.h5'
!ls -lh data.h5
-rw-r--r-- 1 rob staff 357K May 6 16:11 data.h5
del f["/experiment1/simulation/data1"]
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (0 members)>
f.close()
# attributes
# Open the existing file read/write ("a"): attributes are written to it below,
# which the read-only default mode of newer h5py versions would reject.
f = h5py.File("data.h5", "a")
f.attrs
<Attributes of HDF5 object at 4768620880>
f.attrs["desc"] = "Result sets from experiments and simulations"
f["experiment1"].attrs["date"] = "2015-1-1"
f["experiment2"].attrs["date"] = "2015-1-2"
f["experiment2/simulation/data1"].attrs["k"] = 1.5
f["experiment2/simulation/data1"].attrs["T"] = 1000
list(f["experiment1"].attrs.keys())
['date']
list(f["experiment2/simulation/data1"].attrs.items())
[('T', 1000), ('k', 1.5)]
"T" in f["experiment2/simulation/data1"].attrs
True
del f["experiment2/simulation/data1"].attrs["T"]
"T" in f["experiment2/simulation/data1"].attrs
False
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])
f["experiment2/simulation/data1"].attrs["t"]
array([1, 2, 3])
f.close()
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5)
Player | Pos | GP | P | G | A | S% | Shift/GP | |
---|---|---|---|---|---|---|---|---|
Rank | ||||||||
1 | Sidney Crosby | C | 80 | 104 | 36 | 68 | 13.9 | 24.0 |
2 | Ryan Getzlaf | C | 77 | 87 | 31 | 56 | 15.2 | 25.2 |
3 | Claude Giroux | C | 82 | 86 | 28 | 58 | 12.6 | 25.1 |
4 | Tyler Seguin | C | 80 | 84 | 37 | 47 | 12.6 | 23.4 |
5 | Corey Perry | R | 81 | 82 | 43 | 39 | 15.4 | 23.2 |
f = tables.open_file("playerstats-2013-2014.h5", mode="w")
grp = f.create_group("/", "season_2013_2014", title="NHL player statistics for the 2013/2014 season")
grp
/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season' children := []
f.root
/ (RootGroup) '' children := ['season_2013_2014' (Group)]
class PlayerStat(tables.IsDescription):
    """Row layout of the table holding per-player season statistics."""

    # Fixed-width byte strings: up to 20 bytes of name, one-byte position code.
    player = tables.StringCol(itemsize=20, dflt="")
    position = tables.StringCol(itemsize=1, dflt="C")
    # Counting statistics, stored as unsigned integers.
    games_played = tables.UInt8Col(dflt=0)
    points = tables.UInt16Col(dflt=0)
    goals = tables.UInt16Col(dflt=0)
    assists = tables.UInt16Col(dflt=0)
    # Fractional statistics, stored as double-precision floats.
    shooting_percentage = tables.Float64Col(dflt=0.0)
    shifts_per_game_played = tables.Float64Col(dflt=0.0)
top30_table = f.create_table(grp, 'top30', PlayerStat, "Top 30 point leaders")
playerstat = top30_table.row
type(playerstat)
tables.tableextension.Row
# PlayerStat field name -> DataFrame column that supplies its value.
field_to_column = {
    "player": "Player",
    "position": "Pos",
    "games_played": "GP",
    "points": "P",
    "goals": "G",
    "assists": "A",
    "shooting_percentage": "S%",
    "shifts_per_game_played": "Shift/GP",
}
for index, row_series in df.iterrows():
    # Fill the reusable Row object field by field from the current DataFrame row ...
    for field, column in field_to_column.items():
        playerstat[field] = row_series[column]
    # ... and append it to the table (the table is flushed after the loop).
    playerstat.append()
top30_table.flush()
top30_table.cols.player[:5]
array([b'Sidney Crosby', b'Ryan Getzlaf', b'Claude Giroux', b'Tyler Seguin', b'Corey Perry'], dtype='|S20')
top30_table.cols.points[:5]
array([104, 87, 86, 84, 82], dtype=uint16)
def print_playerstat(row):
    """Print one player row as tab-separated name, points, goals and assists.

    ``row`` must support item access for the keys "player" (UTF-8 encoded
    bytes), "points", "goals" and "assists". The name is right-aligned in a
    20-character field.
    """
    player = row["player"].decode('UTF-8')
    print("{:>20}\t{}\t{}\t{}".format(
        player, row["points"], row["goals"], row["assists"]))
# Print every row stored in the table.
for row in top30_table.iterrows():
    print_playerstat(row)
Sidney Crosby 104 36 68 Ryan Getzlaf 87 31 56 Claude Giroux 86 28 58 Tyler Seguin 84 37 47 Corey Perry 82 43 39 Phil Kessel 80 37 43 Taylor Hall 80 27 53 Alex Ovechkin 79 51 28 Joe Pavelski 79 41 38 Jamie Benn 79 34 45 Nicklas Backstrom 79 18 61 Patrick Sharp 78 34 44 Joe Thornton 76 11 65 Erik Karlsson 74 20 54 Evgeni Malkin 72 23 49 Patrick Marleau 70 33 37 Anze Kopitar 70 29 41 Matt Duchene 70 23 47 Martin St. Louis 69 30 39 Patrick Kane 69 29 40 Blake Wheeler 69 28 41 Kyle Okposo 69 27 42 David Krejci 69 19 50 Chris Kunitz 68 35 33 Jonathan Toews 68 28 40 Thomas Vanek 68 27 41 Jaromir Jagr 67 24 43 John Tavares 66 24 42 Jason Spezza 66 23 43 Jordan Eberle 65 28 37
# Print only the rows that satisfy the condition string passed to where().
for row in top30_table.where("(points > 75) & (points <= 80)"):
    print_playerstat(row)
Phil Kessel 80 37 43 Taylor Hall 80 27 53 Alex Ovechkin 79 51 28 Joe Pavelski 79 41 38 Jamie Benn 79 34 45 Nicklas Backstrom 79 18 61 Patrick Sharp 78 34 44 Joe Thornton 76 11 65
# Conditions may combine several columns: more than 40 goals but fewer than 80 points.
for row in top30_table.where("(goals > 40) & (points < 80)"):
    print_playerstat(row)
Alex Ovechkin 79 51 28 Joe Pavelski 79 41 38
f
File(filename=playerstats-2013-2014.h5, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None)) / (RootGroup) '' /season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season' /season_2013_2014/top30 (Table(30,)) 'Top 30 point leaders' description := { "assists": UInt16Col(shape=(), dflt=0, pos=0), "games_played": UInt8Col(shape=(), dflt=0, pos=1), "goals": UInt16Col(shape=(), dflt=0, pos=2), "player": StringCol(itemsize=20, shape=(), dflt=b'', pos=3), "points": UInt16Col(shape=(), dflt=0, pos=4), "position": StringCol(itemsize=1, shape=(), dflt=b'C', pos=5), "shifts_per_game_played": Float64Col(shape=(), dflt=0.0, pos=6), "shooting_percentage": Float64Col(shape=(), dflt=0.0, pos=7)} byteorder := 'little' chunkshape := (1489,)
f.flush()
f.close()
!h5ls -rv playerstats-2013-2014.h5
Opened "playerstats-2013-2014.h5" with sec2 driver. / Group Attribute: CLASS scalar Type: 5-byte null-terminated UTF-8 string Data: "GROUP" Attribute: PYTABLES_FORMAT_VERSION scalar Type: 3-byte null-terminated UTF-8 string Data: "2.1" Attribute: TITLE null Type: 1-byte null-terminated UTF-8 string Attribute: VERSION scalar Type: 3-byte null-terminated UTF-8 string Data: "1.0" Location: 1:96 Links: 1 /season_2013_2014 Group Attribute: CLASS scalar Type: 5-byte null-terminated UTF-8 string Data: "GROUP" Attribute: TITLE scalar Type: 46-byte null-terminated UTF-8 string Data: "NHL player statistics for the 2013/2014 season" Attribute: VERSION scalar Type: 3-byte null-terminated UTF-8 string Data: "1.0" Location: 1:1024 Links: 1 /season_2013_2014/top30 Dataset {30/Inf} Attribute: CLASS scalar Type: 5-byte null-terminated UTF-8 string Data: "TABLE" Attribute: FIELD_0_FILL scalar Type: native unsigned short Data: 0 Attribute: FIELD_0_NAME scalar Type: 7-byte null-terminated UTF-8 string Data: "assists" Attribute: FIELD_1_FILL scalar Type: native unsigned char Data: 0 Attribute: FIELD_1_NAME scalar Type: 12-byte null-terminated UTF-8 string Data: "games_played" Attribute: FIELD_2_FILL scalar Type: native unsigned short Data: 0 Attribute: FIELD_2_NAME scalar Type: 5-byte null-terminated UTF-8 string Data: "goals" Attribute: FIELD_3_FILL scalar Type: 1-byte null-terminated ASCII string Data: "" Attribute: FIELD_3_NAME scalar Type: 6-byte null-terminated UTF-8 string Data: "player" Attribute: FIELD_4_FILL scalar Type: native unsigned short Data: 0 Attribute: FIELD_4_NAME scalar Type: 6-byte null-terminated UTF-8 string Data: "points" Attribute: FIELD_5_FILL scalar Type: 1-byte null-terminated ASCII string Data: "C" Attribute: FIELD_5_NAME scalar Type: 8-byte null-terminated UTF-8 string Data: "position" Attribute: FIELD_6_FILL scalar Type: native double Data: 0 Attribute: FIELD_6_NAME scalar Type: 22-byte null-terminated UTF-8 string Data: "shifts_per_game_played" 
Attribute: FIELD_7_FILL scalar Type: native double Data: 0 Attribute: FIELD_7_NAME scalar Type: 19-byte null-terminated UTF-8 string Data: "shooting_percentage" Attribute: NROWS scalar Type: native long Data: 30 Attribute: TITLE scalar Type: 20-byte null-terminated UTF-8 string Data: "Top 30 point leaders" Attribute: VERSION scalar Type: 3-byte null-terminated UTF-8 string Data: "2.7" Location: 1:2264 Links: 1 Chunks: {1489} 65516 bytes Storage: 1320 logical bytes, 65516 allocated bytes, 2.01% utilization Type: struct { "assists" +0 native unsigned short "games_played" +2 native unsigned char "goals" +3 native unsigned short "player" +5 20-byte null-terminated ASCII string "points" +25 native unsigned short "position" +27 1-byte null-terminated ASCII string "shifts_per_game_played" +28 native double "shooting_percentage" +36 native double } 44 bytes H5tools-DIAG: Error detected in HDF5:tools (1.8.14) thread 0: #000: h5tools_dump.c line 1843 in h5tools_dump_mem(): H5Sis_simple failed major: Failure in tools library minor: error in function
import pandas as pd
store = pd.HDFStore('store.h5')
df = pd.DataFrame(np.random.rand(5,5))
store["df1"] = df
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
store["df2"] = df
store.keys()
['/df1', '/df2']
'df2' in store
True
df = store["df1"]
store.root
/ (RootGroup) '' children := ['df1' (Group), 'df2' (Group)]
store.close()
# The pandas-written store is only inspected here, so open it read-only explicitly
# instead of relying on h5py's version-dependent default mode.
f = h5py.File("store.h5", "r")
f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x))//8), y))
df1 <HDF5 group "/df1" (4 members)> df1/axis0 <HDF5 dataset "axis0": shape (5,), type "<i8"> df1/axis1 <HDF5 dataset "axis1": shape (5,), type "<i8"> df1/block0_items <HDF5 dataset "block0_items": shape (5,), type "<i8"> df1/block0_values <HDF5 dataset "block0_values": shape (5, 5), type "<f8"> df2 <HDF5 group "/df2" (8 members)> df2/axis0 <HDF5 dataset "axis0": shape (21,), type "|S8"> df2/axis1 <HDF5 dataset "axis1": shape (30,), type "<i8"> df2/block0_items <HDF5 dataset "block0_items": shape (3,), type "|S8"> df2/block0_values <HDF5 dataset "block0_values": shape (30, 3), type "<f8"> df2/block1_items <HDF5 dataset "block1_items": shape (14,), type "|S4"> df2/block1_values <HDF5 dataset "block1_values": shape (30, 14), type "<i8"> df2/block2_items <HDF5 dataset "block2_items": shape (4,), type "|S6"> df2/block2_values <HDF5 dataset "block2_values": shape (1,), type "|O">
# Dataset.value is deprecated (removed in h5py 3.0); [()] reads the full dataset.
f["/df2/block0_items"][()]
array([b'S%', b'Shift/GP', b'FO%'], dtype='|S8')
f["/df2/block0_values"][:3]
array([[13.9, 24. , 52.5], [15.2, 25.2, 49. ], [12.6, 25.1, 52.9]])
# Dataset.value is deprecated (removed in h5py 3.0); [()] reads the full dataset.
f["/df2/block1_items"][()]
array([b'Rank', b'GP', b'G', b'A', b'P', b'+/-', b'PIM', b'PPG', b'PPP', b'SHG', b'SHP', b'GW', b'OT', b'S'], dtype='|S4')
f["/df2/block1_values"][:3, :5]
array([[ 1, 80, 36, 68, 104], [ 2, 77, 31, 56, 87], [ 3, 82, 28, 58, 86]])
data = ["string", 1.0, 2, None]
data_json = json.dumps(data)
data_json
'["string", 1.0, 2, null]'
data2 = json.loads(data_json)
# Inspect the list decoded from JSON (data2), not the original input list.
data2
['string', 1.0, 2, None]
# First element of the list decoded from JSON.
data2[0]
'string'
data = {"one": 1, "two": 2.0, "three": "three"}
data_json = json.dumps(data)
print(data_json)
{"one": 1, "two": 2.0, "three": "three"}
data = json.loads(data_json)
data["two"]
2.0
data["three"]
'three'
data = {"one": [1],
"two": [1, 2],
"three": [1, 2, 3]}
data_json = json.dumps(data, indent=True)
print(data_json)
{ "one": [ 1 ], "two": [ 1, 2 ], "three": [ 1, 2, 3 ] }
data = {"one": [1],
"two": {"one": 1, "two": 2},
"three": [(1,), (1, 2), (1, 2, 3)],
"four": "a text string"}
# Serialize the dictionary straight into a file. JSON has no tuple type, so the
# tuples stored under "three" are written as lists.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f)
!cat data.json
{"one": [1], "two": {"one": 1, "two": 2}, "three": [[1], [1, 2], [1, 2, 3]], "four": "a text string"}
# Parse the JSON document back into Python objects.
with open("data.json", "r", encoding="utf-8") as f:
    data_from_file = json.load(f)
data_from_file["two"]
{'one': 1, 'two': 2}
data_from_file["three"]
[[1], [1, 2], [1, 2, 3]]
!head -n 20 tokyo-metro.json
{ "C": { "color": "#149848", "transfers": [ [ "C3", "F15" ], [ "C4", "Z2" ], [ "C4", "G2" ], [ "C7", "M14" ],
!wc tokyo-metro.json
1471 1508 27638 tokyo-metro.json
# Load the metro network description; decode explicitly as UTF-8 rather than
# relying on the platform's default text encoding.
with open("tokyo-metro.json", "r", encoding="utf-8") as f:
    data = json.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
data["C"].keys()
dict_keys(['color', 'transfers', 'travel_times'])
data["C"]["color"]
'#149848'
data["C"]["transfers"]
[['C3', 'F15'], ['C4', 'Z2'], ['C4', 'G2'], ['C7', 'M14'], ['C7', 'N6'], ['C7', 'G6'], ['C8', 'M15'], ['C8', 'H6'], ['C9', 'H7'], ['C9', 'Y18'], ['C11', 'T9'], ['C11', 'M18'], ['C11', 'Z8'], ['C12', 'M19'], ['C18', 'H21']]
[(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1]
[('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]
data
{'C': {'color': '#149848', 'transfers': [['C3', 'F15'], ['C4', 'Z2'], ['C4', 'G2'], ['C7', 'M14'], ['C7', 'N6'], ['C7', 'G6'], ['C8', 'M15'], ['C8', 'H6'], ['C9', 'H7'], ['C9', 'Y18'], ['C11', 'T9'], ['C11', 'M18'], ['C11', 'Z8'], ['C12', 'M19'], ['C18', 'H21']], 'travel_times': [['C1', 'C2', 2], ['C2', 'C3', 2], ['C3', 'C4', 1], ['C4', 'C5', 2], ['C5', 'C6', 2], ['C6', 'C7', 2], ['C7', 'C8', 1], ['C8', 'C9', 3], ['C9', 'C10', 1], ['C10', 'C11', 2], ['C11', 'C12', 2], ['C12', 'C13', 2], ['C13', 'C14', 2], ['C14', 'C15', 2], ['C15', 'C16', 2], ['C16', 'C17', 3], ['C17', 'C18', 3], ['C18', 'C19', 3]]}, 'G': {'color': '#f59230', 'transfers': [['G1', 'Z1'], ['G1', 'F16'], ['G2', 'Z2'], ['G2', 'C4'], ['G4', 'Z3'], ['G5', 'M13'], ['G5', 'Y16'], ['G5', 'Z4'], ['G5', 'N7'], ['G6', 'N6'], ['G6', 'M14'], ['G6', 'C7'], ['G9', 'M16'], ['G9', 'H8'], ['G11', 'T10'], ['G12', 'Z9'], ['G15', 'H16'], ['G16', 'H17']], 'travel_times': [['G1', 'G2', 2], ['G2', 'G3', 1], ['G3', 'G4', 2], ['G4', 'G5', 2], ['G5', 'G6', 2], ['G6', 'G7', 2], ['G7', 'G8', 2], ['G8', 'G9', 2], ['G9', 'G10', 1], ['G10', 'G11', 2], ['G11', 'G12', 2], ['G12', 'G13', 1], ['G13', 'G14', 2], ['G14', 'G15', 2], ['G15', 'G16', 1], ['G16', 'G17', 2], ['G17', 'G18', 1], ['G18', 'G19', 2]]}, 'F': {'color': '#b96528', 'transfers': [['F1', 'Y1'], ['F2', 'Y2'], ['F3', 'Y3'], ['F4', 'Y4'], ['F5', 'Y5'], ['F6', 'Y6'], ['F7', 'Y7'], ['F8', 'Y8'], ['F9', 'Y9'], ['F9', 'M25'], ['F13', 'M9'], ['F15', 'C3'], ['F16', 'Z1'], ['F16', 'G1']], 'travel_times': [['F1', 'F2', 3], ['F2', 'F3', 2], ['F3', 'F4', 3], ['F4', 'F5', 2], ['F5', 'F6', 2], ['F6', 'F7', 2], ['F7', 'F8', 2], ['F8', 'F9', 2], ['F9', 'F10', 3], ['F10', 'F11', 2], ['F11', 'F12', 2], ['F12', 'F13', 2], ['F13', 'F14', 3], ['F14', 'F15', 2], ['F15', 'F16', 2]]}, 'H': {'color': '#9cacb5', 'transfers': [['H6', 'M15'], ['H6', 'C8'], ['H7', 'Y18'], ['H7', 'C9'], ['H8', 'M16'], ['H8', 'G9'], ['H12', 'T11'], ['H16', 'G15'], ['H17', 'G16'], ['H21', 'C18']], 'travel_times': 
[['H1', 'H2', 3], ['H2', 'H3', 3], ['H3', 'H4', 3], ['H4', 'H5', 3], ['H5', 'H6', 2], ['H6', 'H7', 3], ['H7', 'H8', 1], ['H8', 'H9', 2], ['H9', 'H10', 2], ['H10', 'H11', 2], ['H11', 'H12', 1], ['H12', 'H13', 3], ['H13', 'H14', 1], ['H14', 'H15', 2], ['H15', 'H16', 2], ['H16', 'H17', 1], ['H17', 'H18', 2], ['H18', 'H19', 2], ['H19', 'H20', 2], ['H20', 'H21', 3]]}, 'M': {'color': '#ff0000', 'transfers': [['M9', 'F13'], ['M12', 'N8'], ['M13', 'G5'], ['M13', 'Y16'], ['M13', 'Z4'], ['M13', 'N7'], ['M14', 'C7'], ['M14', 'G6'], ['M14', 'N6'], ['M15', 'H6'], ['M15', 'C8'], ['M16', 'G9'], ['M16', 'H8'], ['M18', 'T9'], ['M18', 'C11'], ['M18', 'Z8'], ['M19', 'C12'], ['M22', 'N11'], ['M25', 'Y9'], ['M25', 'F9']], 'travel_times': [['M1', 'M2', 2], ['M2', 'M3', 2], ['M3', 'M4', 2], ['M4', 'M5', 2], ['M5', 'M6', 2], ['M6', 'M7', 2], ['M7', 'M8', 2], ['M8', 'M9', 2], ['M9', 'M10', 1], ['M10', 'M11', 2], ['M11', 'M12', 2], ['M12', 'M13', 3], ['M13', 'M14', 2], ['M14', 'M15', 1], ['M15', 'M16', 3], ['M16', 'M17', 2], ['M17', 'M18', 2], ['M18', 'M19', 2], ['M19', 'M20', 1], ['M20', 'M21', 2], ['M21', 'M22', 2], ['M22', 'M23', 3], ['M23', 'M24', 2], ['M24', 'M25', 3], ['m3', 'm4', 2], ['m4', 'm5', 2], ['m5', 'M6', 2]]}, 'N': {'color': '#1aaca9', 'transfers': [['N1', 'T1'], ['N2', 'T2'], ['N3', 'T3'], ['N6', 'G6'], ['N6', 'M14'], ['N6', 'C7'], ['N7', 'Y16'], ['N7', 'Z4'], ['N7', 'G5'], ['N7', 'M13'], ['N8', 'M12'], ['N9', 'Y14'], ['N10', 'Y13'], ['N10', 'T6'], ['N11', 'M22']], 'travel_times': [['N1', 'N2', 2], ['N2', 'N3', 2], ['N3', 'N4', 2], ['N4', 'N5', 2], ['N5', 'N6', 2], ['N6', 'N7', 2], ['N7', 'N8', 2], ['N8', 'N9', 2], ['N9', 'N10', 2], ['N10', 'N11', 2], ['N11', 'N12', 3], ['N12', 'N13', 2], ['N13', 'N14', 2], ['N14', 'N15', 3], ['N15', 'N16', 1], ['N16', 'N17', 3], ['N17', 'N18', 2], ['N18', 'N19', 2]]}, 'T': {'color': '#1aa7d8', 'transfers': [['T6', 'N10'], ['T6', 'Y13'], ['T7', 'Z6'], ['T9', 'M18'], ['T9', 'C11'], ['T9', 'Z8'], ['T10', 'G11'], ['T11', 'H12']], 
'travel_times': [['T1', 'T2', 0], ['T2', 'T3', 3], ['T3', 'T4', 6], ['T4', 'T5', 9], ['T5', 'T6', 11], ['T6', 'T7', 13], ['T7', 'T8', 14], ['T8', 'T9', 16], ['T9', 'T10', 18], ['T10', 'T11', 20], ['T11', 'T12', 21], ['T12', 'T13', 24], ['T13', 'T14', 26], ['T14', 'T15', 27], ['T15', 'T16', 30], ['T16', 'T17', 33], ['T17', 'T18', 35], ['T18', 'T19', 37], ['T19', 'T20', 39], ['T20', 'T21', 41], ['T21', 'T22', 43], ['T22', 'T23', 46], ['T23', 'T24', 49]]}, 'Y': {'color': '#ede7c3', 'transfers': [['Y1', 'F1'], ['Y2', 'F2'], ['Y3', 'F3'], ['Y4', 'F4'], ['Y5', 'F5'], ['Y6', 'F6'], ['Y7', 'F7'], ['Y8', 'F8'], ['Y9', 'F9'], ['Y9', 'M25'], ['Y13', 'T6'], ['Y13', 'N10'], ['Y14', 'N9'], ['Y16', 'Z4'], ['Y16', 'N7'], ['Y16', 'G5'], ['Y16', 'M13'], ['Y18', 'H7'], ['Y18', 'C9']], 'travel_times': [['Y1', 'Y2', 4], ['Y2', 'Y3', 2], ['Y3', 'Y4', 3], ['Y4', 'Y5', 2], ['Y5', 'Y6', 2], ['Y6', 'Y7', 2], ['Y7', 'Y8', 2], ['Y8', 'Y9', 3], ['Y9', 'Y10', 2], ['Y10', 'Y11', 2], ['Y11', 'Y12', 2], ['Y12', 'Y13', 3], ['Y13', 'Y14', 2], ['Y14', 'Y15', 2], ['Y15', 'Y16', 1], ['Y16', 'Y17', 2], ['Y17', 'Y18', 2], ['Y18', 'Y19', 2], ['Y19', 'Y20', 2], ['Y20', 'Y21', 2], ['Y21', 'Y22', 2], ['Y22', 'Y23', 3], ['Y23', 'Y24', 2]]}, 'Z': {'color': '#a384bf', 'transfers': [['Z1', 'F16'], ['Z1', 'G1'], ['Z2', 'C4'], ['Z2', 'G2'], ['Z3', 'G4'], ['Z4', 'Y16'], ['Z4', 'N7'], ['Z4', 'M13'], ['Z4', 'G5'], ['Z6', 'T7'], ['Z8', 'M18'], ['Z8', 'C11'], ['Z8', 'T9'], ['Z9', 'G12']], 'travel_times': [['Z1', 'Z2', 3], ['Z2', 'Z3', 2], ['Z3', 'Z4', 2], ['Z4', 'Z5', 2], ['Z5', 'Z6', 2], ['Z6', 'Z7', 2], ['Z7', 'Z8', 2], ['Z8', 'Z9', 2], ['Z9', 'Z10', 3], ['Z10', 'Z11', 3], ['Z11', 'Z12', 3], ['Z12', 'Z13', 2], ['Z13', 'Z14', 2]]}}
!ls -lh tokyo-metro.json
-rw-r--r-- 1 rob staff 27K Mar 25 2018 tokyo-metro.json
data_pack = msgpack.packb(data)
# del data
type(data_pack)
bytes
len(data_pack)
3021
# msgpack output is bytes, so the file must be opened in binary mode.
with open("tokyo-metro.msgpack", "wb") as f:
    f.write(data_pack)
!ls -lh tokyo-metro.msgpack
-rw-r--r-- 1 rob staff 3.0K May 6 16:12 tokyo-metro.msgpack
# Read the packed bytes back; the file only needs to stay open for the read.
with open("tokyo-metro.msgpack", "rb") as f:
    data_msgpack = f.read()
# NOTE: with msgpack < 1.0 the default raw=True returns strings as bytes, so the
# dictionary keys come back as b'C', b'G', ...; pass raw=False to get str back.
data = msgpack.unpackb(data_msgpack)
list(data.keys())
[b'C', b'G', b'F', b'H', b'M', b'N', b'T', b'Y', b'Z']
# Pickle writes a binary stream, hence the "wb" mode.
with open("tokyo-metro.pickle", "wb") as f:
    pickle.dump(data, f)
del data
!ls -lh tokyo-metro.pickle
-rw-r--r-- 1 rob staff 8.5K May 6 16:12 tokyo-metro.pickle
# Restore the object written above. Unpickling can execute arbitrary code, so
# only load pickle files that come from a trusted source such as this one.
with open("tokyo-metro.pickle", "rb") as f:
    data = pickle.load(f)
data.keys()
dict_keys([b'C', b'G', b'F', b'H', b'M', b'N', b'T', b'Y', b'Z'])
%reload_ext version_information
%version_information numpy, pandas, csv, json, tables, h5py, msgpack
Software | Version |
---|---|
Python | 3.6.8 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] |
IPython | 7.5.0 |
OS | Darwin 18.2.0 x86_64 i386 64bit |
numpy | 1.16.3 |
pandas | 0.24.2 |
csv | 1.0 |
json | 2.0.9 |
tables | 3.5.1 |
h5py | 2.9.0 |
msgpack | 0.6.1 |
Mon May 06 16:13:03 2019 JST |