#!/usr/bin/env python # coding: utf-8 # # Chapter 18: Code listing # Robert Johansson # # Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://www.apress.com/us/book/9781484242452) (ISBN 978-1-484242-45-2). # ## Imports # In[1]: from __future__ import print_function # In[2]: import numpy as np np.random.seed(0) # In[3]: import pandas as pd # In[4]: import csv # In[5]: import json # In[6]: import h5py # In[7]: import tables # In[9]: import pickle # import cPickle # In[10]: import msgpack # # CSV # In[11]: get_ipython().run_cell_magic('writefile', 'playerstats-2013-2014.csv', '# 2013-2014 / Regular Season / All Skaters / Summary / Points\nRank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%\n1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5\n2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0\n3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9\n4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5\n5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0\n') # In[12]: get_ipython().run_cell_magic('writefile', 'playerstats-2013-2014-top30.csv', '# 2013-2014 / Regular Season / All Skaters / Summary / Points\nRank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%\n1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5\n2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0\n3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9\n4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5\n5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0\n6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3\n7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7\n8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7\n9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0\n10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8\n11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4\n12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6\n13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1\n14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0\n15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8\n16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9\n17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3\n18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3\n19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7\n20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0\n21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5\n22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5\n23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2\n24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0\n25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2\n26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5\n27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0\n28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1\n29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0\n30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1\n') # In[13]: get_ipython().system('head -n 5 playerstats-2013-2014-top30.csv') # In[14]: rows = [] # In[15]: with open("playerstats-2013-2014.csv") as f: csvreader = csv.reader(f) rows = [fields for fields in csvreader] # In[16]: rows[1][1:6] # In[17]: rows[2][1:6] # In[18]: data = np.random.randn(100, 3) # In[19]: np.savetxt("data.csv", data, delimiter=",", header="x, y, z", comments="# Random x, y, z coordinates\n") # In[20]: get_ipython().system('head -n 5 data.csv') # In[21]: data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",") # In[22]: data_load[1,:] # In[23]: data_load.dtype # In[24]: (data == data_load).all() # In[29]: data = np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes) # In[30]: data[0][1:6] # In[31]: np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6,7,8]) # In[32]: df = pd.read_csv("playerstats-2013-2014.csv", skiprows=1) # In[33]: df = df.set_index("Rank") # In[34]: df[["Player", "GP", "G", "A", "P"]] # In[35]: df.info() # In[36]: df[["Player", "GP", "G", "A", "P"]].to_csv("playerstats-2013-2014-subset.csv") # In[37]: get_ipython().system('head -n 5 playerstats-2013-2014-subset.csv') # # HDF5 # ## h5py # In[38]: import h5py # In[39]: # mode = "w", "r", "w-", "r+", "a" # In[40]: f = h5py.File("data.h5", "w") # In[41]: f.mode # In[42]: f.flush() # In[43]: f.close() # In[44]: f = h5py.File("data.h5", "w") # In[45]: f.name # In[46]: grp1 = f.create_group("experiment1") # In[47]: grp1.name # In[48]: grp2_meas = f.create_group("experiment2/measurement") # In[49]: grp2_meas.name # In[50]: grp2_sim = f.create_group("experiment2/simulation") # In[51]: grp2_sim.name # In[52]: f["/experiment1"] # In[53]: f["/experiment2/simulation"] # In[54]: grp_expr2 = f["/experiment2"] # In[55]: grp_expr2['simulation'] # In[56]: list(f.keys()) # In[57]: list(f.items()) # In[58]: f.visit(lambda x: print(x)) # In[59]: f.visititems(lambda name, value: print(name, value)) # In[60]: "experiment1" in f # In[61]: "simulation" in f["experiment2"] # In[62]: "experiment3" in f # In[63]: f.flush() # In[64]: get_ipython().system('h5ls -r data.h5') # In[65]: data1 = np.arange(10) # In[66]: data2 = np.random.randn(100, 100) # In[67]: f["array1"] = data1 # In[68]: f["/experiment2/measurement/meas1"] = data2 # In[69]: f.visititems(lambda name, value: print(name, value)) # In[70]: ds = f["array1"] # In[71]: ds # In[72]: ds.name # In[73]: ds.dtype # In[74]: ds.shape # In[75]: ds.len() # In[76]: ds.value # In[77]: ds = f["/experiment2/measurement/meas1"] # In[78]: ds # In[79]: ds.dtype # In[80]: ds.shape # In[81]: data_full = ds[...] # In[82]: type(data_full) # In[83]: data_full.shape # In[84]: data_col = ds[:, 0] # In[85]: data_col.shape # In[86]: ds[10:20:3, 10:20:3] # In[87]: ds[[1,2,3], :].shape # In[88]: ds[[1,2,3], :].shape # In[89]: mask = ds[:, 0] > 2.0 # In[90]: mask.shape, mask.dtype # In[91]: ds[mask, 0] # In[92]: ds[mask, :5] # In[93]: # create empty data sets, assign and update datasets # In[94]: ds = f.create_dataset("array2", data=np.random.randint(10, size=10)) # In[95]: ds # In[96]: ds.value # In[97]: ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1) # In[98]: ds # In[99]: ds.value # In[100]: ds = f.create_dataset("/experiment1/simulation/data1", shape=(5000, 5000, 5000), fillvalue=0, compression='gzip') # In[101]: ds # In[102]: ds[:, 0, 0] = np.random.rand(5000) # In[103]: ds[1, :, 0] += np.random.rand(5000) # In[104]: ds[:2, :5, 0] # In[105]: ds.fillvalue # In[106]: f["experiment1"].visititems(lambda name, value: print(name, value)) # In[107]: float(np.prod(ds.shape) * ds[0,0,0].nbytes) / (1024**3) # Gb # In[108]: f.flush() # In[109]: f.filename # In[110]: get_ipython().system('ls -lh data.h5') # In[111]: del f["/experiment1/simulation/data1"] # In[112]: f["experiment1"].visititems(lambda name, value: print(name, value)) # In[113]: f.close() # In[114]: # attributes # In[115]: f = h5py.File("data.h5") # In[116]: f.attrs # In[117]: f.attrs["desc"] = "Result sets from experiments and simulations" # In[118]: f["experiment1"].attrs["date"] = "2015-1-1" # In[119]: f["experiment2"].attrs["date"] = "2015-1-2" # In[120]: f["experiment2/simulation/data1"].attrs["k"] = 1.5 # In[121]: f["experiment2/simulation/data1"].attrs["T"] = 1000 # In[122]: list(f["experiment1"].attrs.keys()) # In[123]: list(f["experiment2/simulation/data1"].attrs.items()) # In[124]: "T" in f["experiment2/simulation/data1"].attrs # In[125]: del f["experiment2/simulation/data1"].attrs["T"] # In[126]: "T" in f["experiment2/simulation/data1"].attrs # In[127]: f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3]) # In[128]: f["experiment2/simulation/data1"].attrs["t"] # In[129]: f.close() # ## pytables # In[130]: df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1) df = df.set_index("Rank") # In[131]: df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5) # In[132]: f = tables.open_file("playerstats-2013-2014.h5", mode="w") # In[133]: grp = f.create_group("/", "season_2013_2014", title="NHL player statistics for the 2013/2014 season") # In[134]: grp # In[135]: f.root # In[136]: class PlayerStat(tables.IsDescription): player = tables.StringCol(20, dflt="") position = tables.StringCol(1, dflt="C") games_played = tables.UInt8Col(dflt=0) points = tables.UInt16Col(dflt=0) goals = tables.UInt16Col(dflt=0) assists = tables.UInt16Col(dflt=0) shooting_percentage = tables.Float64Col(dflt=0.0) shifts_per_game_played = tables.Float64Col(dflt=0.0) # In[137]: top30_table = f.create_table(grp, 'top30', PlayerStat, "Top 30 point leaders") # In[138]: playerstat = top30_table.row # In[139]: type(playerstat) # In[140]: for index, row_series in df.iterrows(): playerstat["player"] = row_series["Player"] playerstat["position"] = row_series["Pos"] playerstat["games_played"] = row_series["GP"] playerstat["points"] = row_series["P"] playerstat["goals"] = row_series["G"] playerstat["assists"] = row_series["A"] playerstat["shooting_percentage"] = row_series["S%"] playerstat["shifts_per_game_played"] = row_series["Shift/GP"] playerstat.append() # In[141]: top30_table.flush() # In[142]: top30_table.cols.player[:5] # In[143]: top30_table.cols.points[:5] # In[144]: def print_playerstat(row): print("%20s\t%s\t%s\t%s" % (row["player"].decode('UTF-8'), row["points"], row["goals"], row["assists"])) # In[145]: for row in top30_table.iterrows(): print_playerstat(row) # In[146]: for row in top30_table.where("(points > 75) & (points <= 80)"): print_playerstat(row) # In[147]: for row in top30_table.where("(goals > 40) & (points < 80)"): print_playerstat(row) # In[148]: f # In[149]: f.flush() # In[150]: f.close() # In[151]: get_ipython().system('h5ls -rv playerstats-2013-2014.h5') # ## Pandas hdfstore # In[152]: import pandas as pd # In[153]: store = pd.HDFStore('store.h5') # In[154]: df = pd.DataFrame(np.random.rand(5,5)) # In[155]: store["df1"] = df # In[156]: df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1) # In[157]: store["df2"] = df # In[158]: store.keys() # In[159]: 'df2' in store # In[160]: df = store["df1"] # In[161]: store.root # In[162]: store.close() # In[163]: f = h5py.File("store.h5") # In[164]: f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x))//8), y)) # In[165]: f["/df2/block0_items"].value # In[166]: f["/df2/block0_values"][:3] # In[167]: f["/df2/block1_items"].value # In[168]: f["/df2/block1_values"][:3, :5] # # JSON # In[169]: data = ["string", 1.0, 2, None] # In[170]: data_json = json.dumps(data) # In[171]: data_json # In[172]: data2 = json.loads(data_json) # In[173]: data # In[174]: data[0] # In[175]: data = {"one": 1, "two": 2.0, "three": "three"} # In[176]: data_json = json.dumps(data) # In[177]: print(data_json) # In[178]: data = json.loads(data_json) # In[179]: data["two"] # In[180]: data["three"] # In[181]: data = {"one": [1], "two": [1, 2], "three": [1, 2, 3]} # In[182]: data_json = json.dumps(data, indent=True) # In[183]: print(data_json) # In[184]: data = {"one": [1], "two": {"one": 1, "two": 2}, "three": [(1,), (1, 2), (1, 2, 3)], "four": "a text string"} # In[185]: with open("data.json", "w") as f: json.dump(data, f) # In[186]: get_ipython().system('cat data.json') # In[187]: with open("data.json", "r") as f: data_from_file = json.load(f) # In[188]: data_from_file["two"] # In[189]: data_from_file["three"] # In[190]: get_ipython().system('head -n 20 tokyo-metro.json') # In[191]: get_ipython().system('wc tokyo-metro.json') # In[211]: with open("tokyo-metro.json", "r") as f: data = json.load(f) # In[212]: data.keys() # In[213]: data["C"].keys() # In[214]: data["C"]["color"] # In[215]: data["C"]["transfers"] # In[216]: [(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1] # In[217]: data # In[218]: get_ipython().system('ls -lh tokyo-metro.json') # In[219]: data_pack = msgpack.packb(data) # In[220]: # del data # In[221]: type(data_pack) # In[222]: len(data_pack) # In[223]: with open("tokyo-metro.msgpack", "wb") as f: f.write(data_pack) # In[224]: get_ipython().system('ls -lh tokyo-metro.msgpack') # In[225]: with open("tokyo-metro.msgpack", "rb") as f: data_msgpack = f.read() data = msgpack.unpackb(data_msgpack) # In[226]: list(data.keys()) # In[227]: with open("tokyo-metro.pickle", "wb") as f: pickle.dump(data, f) # In[228]: del data # In[229]: get_ipython().system('ls -lh tokyo-metro.pickle') # In[230]: with open("tokyo-metro.pickle", "rb") as f: data = pickle.load(f) # In[231]: data.keys() # # Versions # In[232]: get_ipython().run_line_magic('reload_ext', 'version_information') # In[233]: get_ipython().run_line_magic('version_information', 'numpy, pandas, csv, json, tables, h5py, msgpack')