Catherine Devlin (catherine.devlin@gmail.com, @catherinedevlin)
Postgres Open 2014
All About Recent Data-Handling Developments in Python
Some Recent Data-Handling Developments in Python
A Few Recent Data-Handling Developments in Python
(aka "Project Jupyter")
Browser-based enhanced Python environment
pip install ipython_sql
pip install psycopg2
%load_ext sql
%sql postgresql://:@/pgo
'Connected: @pgo'
%sql SELECT * FROM yob2010 LIMIT 5
5 rows affected.
name | sex | n_births |
---|---|---|
Isabella | F | 22731 |
Sophia | F | 20477 |
Emma | F | 17179 |
Olivia | F | 16860 |
Ava | F | 15300 |
%%sql
SELECT *
FROM yob2010
LIMIT 5
5 rows affected.
name | sex | n_births |
---|---|---|
Isabella | F | 22731 |
Sophia | F | 20477 |
Emma | F | 17179 |
Olivia | F | 16860 |
Ava | F | 15300 |
names = %sql SELECT * FROM yob2010
33838 rows affected.
names[2]
('Emma', 'F', 17179)
names[2].sex
'F'
WARNINGS:
%sql
import pandas as pd
!head sample_data/yob2000.txt
yob2000 = pd.read_csv('sample_data/yob2000.txt', names=['name','sex','n_births'])
yob2000.head()
name | sex | n_births | |
---|---|---|---|
0 | Emily | F | 25949 |
1 | Hannah | F | 23066 |
2 | Madison | F | 19965 |
3 | Ashley | F | 17991 |
4 | Sarah | F | 17677 |
pd.read_excel
<function pandas.io.excel.read_excel>
yob2000.head(6).T
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
name | Emily | Hannah | Madison | Ashley | Sarah | Alexis |
sex | F | F | F | F | F | F |
n_births | 25949 | 23066 | 19965 | 17991 | 17677 | 17622 |
yob2000.head(6).T.sum(axis=1)
name EmilyHannahMadisonAshleySarahAlexis sex FFFFFF n_births 122270 dtype: object
ls sample_data/yob*.txt
sample_data/yob2000.txt sample_data/yob2004.txt sample_data/yob2008.txt sample_data/yob2001.txt sample_data/yob2005.txt sample_data/yob2009.txt sample_data/yob2002.txt sample_data/yob2006.txt sample_data/yob2010.txt sample_data/yob2003.txt sample_data/yob2007.txt
all_names = pd.DataFrame()
for yr in range(2000, 2011):
one_yr = pd.read_csv('sample_data/yob%d.txt' % yr, names=['name','sex','n_births'])
one_yr['year'] = yr
all_names = all_names.append(one_yr, ignore_index=True)
all_names.head()
name | sex | n_births | year | |
---|---|---|---|---|
0 | Emily | F | 25949 | 2000 |
1 | Hannah | F | 23066 | 2000 |
2 | Madison | F | 19965 | 2000 |
3 | Ashley | F | 17991 | 2000 |
4 | Sarah | F | 17677 | 2000 |
all_names.tail()
name | sex | n_births | year | |
---|---|---|---|---|
358607 | Zymaire | M | 5 | 2010 |
358608 | Zyonne | M | 5 | 2010 |
358609 | Zyquarius | M | 5 | 2010 |
358610 | Zyran | M | 5 | 2010 |
358611 | Zzyzx | M | 5 | 2010 |
lesleys = all_names[all_names.name.str.startswith('Lesl')]
lesleys
name | sex | n_births | year | |
---|---|---|---|---|
89 | Leslie | F | 3619 | 2000 |
388 | Lesly | F | 742 | 2000 |
429 | Lesley | F | 658 | 2000 |
1313 | Lesli | F | 145 | 2000 |
2248 | Leslye | F | 70 | 2000 |
3618 | Leslee | F | 36 | 2000 |
6101 | Leslieann | F | 18 | 2000 |
8316 | Leslian | F | 12 | 2000 |
9578 | Leslyann | F | 10 | 2000 |
12558 | Leslei | F | 7 | 2000 |
12559 | Lesleigh | F | 7 | 2000 |
18697 | Leslie | M | 137 | 2000 |
22136 | Lesley | M | 17 | 2000 |
23802 | Lesly | M | 11 | 2000 |
29108 | Leslee | M | 5 | 2000 |
29840 | Leslie | F | 3610 | 2001 |
30128 | Lesly | F | 801 | 2001 |
30280 | Lesley | F | 509 | 2001 |
30977 | Lesli | F | 166 | 2001 |
31796 | Leslye | F | 83 | 2001 |
33908 | Leslee | F | 30 | 2001 |
40367 | Lesleigh | F | 9 | 2001 |
40368 | Leslieann | F | 9 | 2001 |
44318 | Leslyann | F | 6 | 2001 |
46575 | Leslei | F | 5 | 2001 |
46576 | Leslian | F | 5 | 2001 |
48845 | Leslie | M | 125 | 2001 |
54977 | Lesley | M | 9 | 2001 |
59331 | Lesly | M | 5 | 2001 |
60100 | Leslie | F | 3520 | 2002 |
... | ... | ... | ... | ... |
225151 | Leslee | F | 30 | 2007 |
230792 | Lesleigh | F | 11 | 2007 |
242592 | Leslie | M | 82 | 2007 |
251229 | Lesley | M | 7 | 2007 |
255312 | Leslie | F | 2323 | 2008 |
255616 | Lesly | F | 699 | 2008 |
256198 | Lesley | F | 253 | 2008 |
256809 | Lesli | F | 137 | 2008 |
257947 | Leslye | F | 65 | 2008 |
260822 | Leslee | F | 25 | 2008 |
269833 | Lesleigh | F | 7 | 2008 |
277489 | Leslie | M | 76 | 2008 |
289328 | Lesley | M | 5 | 2008 |
290331 | Leslie | F | 1975 | 2009 |
290673 | Lesly | F | 598 | 2009 |
291307 | Lesley | F | 222 | 2009 |
292364 | Lesli | F | 90 | 2009 |
294585 | Leslye | F | 34 | 2009 |
295620 | Leslee | F | 26 | 2009 |
312190 | Leslie | M | 76 | 2009 |
318337 | Lesley | M | 10 | 2009 |
324970 | Leslie | F | 1558 | 2010 |
325353 | Lesly | F | 502 | 2010 |
325892 | Lesley | F | 219 | 2010 |
327213 | Lesli | F | 76 | 2010 |
329662 | Leslye | F | 29 | 2010 |
331019 | Leslee | F | 21 | 2010 |
343300 | Lesleigh | F | 5 | 2010 |
347062 | Leslie | M | 47 | 2010 |
356014 | Lesley | M | 6 | 2010 |
119 rows × 4 columns
lesleys.pivot_table('n_births', rows='year',
cols='sex', aggfunc=sum)
/home/catherine/ana3/lib/python3.4/site-packages/pandas/util/decorators.py:53: FutureWarning: cols is deprecated, use columns instead warnings.warn(msg, FutureWarning) /home/catherine/ana3/lib/python3.4/site-packages/pandas/util/decorators.py:53: FutureWarning: rows is deprecated, use index instead warnings.warn(msg, FutureWarning)
sex | F | M |
---|---|---|
year | ||
2000 | 5324 | 170 |
2001 | 5233 | 139 |
2002 | 4994 | 114 |
2003 | 5187 | 121 |
2004 | 4970 | 106 |
2005 | 4552 | 122 |
2006 | 4412 | 94 |
2007 | 4052 | 89 |
2008 | 3509 | 81 |
2009 | 2945 | 86 |
2010 | 2410 | 53 |
gender = all_names.pivot_table('n_births', rows='name',
cols='sex', aggfunc=sum)
gender
sex | F | M |
---|---|---|
name | ||
Aaban | NaN | 20 |
Aabid | NaN | 5 |
Aabriella | 5 | NaN |
Aadam | NaN | 81 |
Aadan | NaN | 69 |
Aadarsh | NaN | 109 |
Aaden | 5 | 2972 |
Aadesh | NaN | 10 |
Aadhav | NaN | 29 |
Aadhavan | NaN | 5 |
Aadhya | 56 | NaN |
Aadi | 5 | 444 |
Aadil | NaN | 167 |
Aadin | NaN | 85 |
Aadison | 6 | NaN |
Aadit | NaN | 157 |
Aadith | NaN | 17 |
Aaditri | 5 | NaN |
Aaditya | NaN | 303 |
Aadon | NaN | 61 |
Aadya | 148 | NaN |
Aadyn | 16 | 243 |
Aafreen | 5 | NaN |
Aahan | NaN | 78 |
Aahana | 181 | NaN |
Aahil | NaN | 147 |
Aahliyah | 24 | NaN |
Aahna | 26 | NaN |
Aahron | NaN | 5 |
Aaidan | NaN | 37 |
... | ... | ... |
Zyran | NaN | 10 |
Zyree | 11 | 30 |
Zyreion | NaN | 5 |
Zyrek | NaN | 5 |
Zyrell | NaN | 56 |
Zyren | NaN | 37 |
Zyreon | NaN | 7 |
Zyrese | NaN | 5 |
Zyrhiana | 6 | NaN |
Zyria | 235 | NaN |
Zyriah | 193 | NaN |
Zyrian | NaN | 10 |
Zyriana | 17 | NaN |
Zyrianna | 18 | NaN |
Zyrie | 5 | NaN |
Zyriel | 5 | NaN |
Zyrihanna | 29 | NaN |
Zyrion | NaN | 45 |
Zyriyah | 6 | NaN |
Zyron | NaN | 104 |
Zyrus | NaN | 28 |
Zysean | NaN | 5 |
Zyshaun | NaN | 29 |
Zyshawn | NaN | 96 |
Zyshon | NaN | 11 |
Zyshonne | NaN | 45 |
Zytavious | NaN | 43 |
Zyvion | NaN | 5 |
Zyyanna | 6 | NaN |
Zzyzx | NaN | 5 |
57008 rows × 2 columns
gender['total'] = gender.F + gender.M
gender['ratio'] = gender.F / gender.total
gender.head()
sex | F | M | total | ratio |
---|---|---|---|---|
name | ||||
Aaban | NaN | 20 | NaN | NaN |
Aabid | NaN | 5 | NaN | NaN |
Aabriella | 5 | NaN | NaN | NaN |
Aadam | NaN | 81 | NaN | NaN |
Aadan | NaN | 69 | NaN | NaN |
gender['andro'] = -1 * (abs(0.5 - gender.ratio))
gender.sort('andro', ascending=False)[
gender.total > 10000].head()
/home/catherine/ana3/lib/python3.4/site-packages/pandas/core/frame.py:1706: UserWarning: Boolean Series key will be reindexed to match DataFrame index. "DataFrame index.", UserWarning)
sex | F | M | total | ratio | andro |
---|---|---|---|---|---|
name | |||||
Jessie | 6075 | 5985 | 12060 | 0.503731 | -0.003731 |
Amari | 7280 | 7571 | 14851 | 0.490203 | -0.009797 |
Peyton | 32371 | 27542 | 59913 | 0.540300 | -0.040300 |
Justice | 6332 | 7478 | 13810 | 0.458508 | -0.041492 |
Riley | 50981 | 43165 | 94146 | 0.541510 | -0.041510 |
states = pd.read_csv('sample_data/states.csv')
states
id | state | capitol | |
---|---|---|---|
0 | 1 | Ohio | Columbus |
1 | 2 | Illinois | Springfield |
2 | 3 | Minnesota | St. Paul |
3 | 4 | Ohio | Columbus |
4 | 5 | Indiana | Indianapolis |
5 | 6 | Michigan | Lansing |
6 | 7 | Ohio | Columbus |
states.drop_duplicates(['state','capitol'])
id | state | capitol | |
---|---|---|---|
0 | 1 | Ohio | Columbus |
1 | 2 | Illinois | Springfield |
2 | 3 | Minnesota | St. Paul |
4 | 5 | Indiana | Indianapolis |
5 | 6 | Michigan | Lansing |
states.drop_duplicates(['state','capitol'],take_last=True)
id | state | capitol | |
---|---|---|---|
1 | 2 | Illinois | Springfield |
2 | 3 | Minnesota | St. Paul |
4 | 5 | Indiana | Indianapolis |
5 | 6 | Michigan | Lansing |
6 | 7 | Ohio | Columbus |
%sql SELECT count(*) FROM yob2010
1 rows affected.
count |
---|
33838 |
import sqlalchemy as sa
db_engine = sa.create_engine('postgresql://:@/pgo')
allpeople = pd.read_sql_table('yob2010',
db_engine)
xmen = allpeople[allpeople.name.str.startswith('X')]
xmen.to_sql('xmen', db_engine)
%sql SELECT * FROM xmen LIMIT 5
5 rows affected.
index | name | sex | n_births |
---|---|---|---|
271 | Ximena | F | 1203 |
988 | Xiomara | F | 254 |
1404 | Xochitl | F | 162 |
1780 | Xitlali | F | 115 |
2425 | Xitlaly | F | 77 |
chris = %sql SELECT * FROM yob2010 WHERE name LIKE 'Chris%'
chris[:5]
76 rows affected.
[('Christina', 'F', 1265), ('Christine', 'F', 425), ('Christiana', 'F', 218), ('Christian', 'F', 154), ('Christy', 'F', 124)]
chrises = chris.DataFrame()
chrises.head()
name | sex | n_births | |
---|---|---|---|
0 | Christina | F | 1265 |
1 | Christine | F | 425 |
2 | Christiana | F | 218 |
3 | Christian | F | 154 |
4 | Christy | F | 124 |
%config SqlMagic.autopandas = True
Special bonus: autopandas
makes biiiiiiiiig queries safer in %sql
erics = %sql SELECT * FROM yob2010 WHERE name LIKE 'Eric%'
erics.head()
10 rows affected.
name | sex | n_births | |
---|---|---|---|
0 | Erica | F | 734 |
1 | Ericka | F | 144 |
2 | Eric | F | 7 |
3 | Eric | M | 4163 |
4 | Erick | M | 1854 |
%sql SELECT count(*) FROM erics
(ProgrammingError) relation "erics" does not exist LINE 1: SELECT count(*) FROM erics ^ 'SELECT count(*) FROM erics' {}
%sql PERSIST erics
'Persisted erics'
%sql SELECT * FROM erics LIMIT 5
5 rows affected.
index | name | sex | n_births | |
---|---|---|---|---|
0 | 0 | Erica | F | 734 |
1 | 1 | Ericka | F | 144 |
2 | 2 | Eric | F | 7 |
3 | 3 | Eric | M | 4163 |
4 | 4 | Erick | M | 1854 |
pip install ddlgenerator
pip install beautifulsoup4
cat sample_data/animals.csv
name,species,kg,notes Alfred,wart hog,22,loves turnips Gertrude,polar bear,312.7,deep thinker Emily,salamander,0.3,
!ddlgenerator postgresql sample_data/animals.csv
CREATE TABLE animals ( name VARCHAR(8) NOT NULL, species VARCHAR(10) NOT NULL, kg DECIMAL(4, 1) NOT NULL, notes VARCHAR(13) NOT NULL );
!ddlgenerator --inserts postgresql sample_data/animals.csv | psql pgo
CREATE TABLE INSERT 0 1 INSERT 0 1 INSERT 0 1
%sql SELECT * FROM animals
3 rows affected.
name | species | kg | notes | |
---|---|---|---|---|
0 | Alfred | wart hog | 22.0 | loves turnips |
1 | Gertrude | polar bear | 312.7 | deep thinker |
2 | Emily | salamander | 0.3 |
!ddlgenerator postgresql --inserts http://en.wikipedia.org/wiki/List_of_cities_in_Ohio | psql pgo > /dev/null
%sql SELECT * FROM list_of_cities_in_ohio LIMIT 5;
5 rows affected.
city | population | county | |
---|---|---|---|
0 | Akron | 199,110 | Summit County |
1 | Alliance | 22,322 | Stark County and Mahoning County |
2 | Amherst | 12,021 | Lorain County |
3 | Ashland | 20,362 | Ashland County |
4 | Ashtabula | 19,124 | Ashtabula County |
!cat sample_data/knights.yaml
- name: Lancelot dob: 9 jan 471 kg: 82 brave: y - name: Gawain kg: 69.2 brave: y - name: Robin dob: 9 jan 471 brave: n followers: - name: Minstrel 1 instrument: harp - name: Minstrel 2 instrument: lute - name: Minstrel 3 instrument: pipes - name: Reepacheep kg: 0.0691 brave: y
!ddlgenerator postgresql sample_data/knights.yaml
WARNING:root:Primary key knights.knights_id not requested, but nesting demands it CREATE TABLE knights ( name VARCHAR(10) NOT NULL, dob TIMESTAMP WITHOUT TIME ZONE, kg DECIMAL(6, 4), brave BOOLEAN NOT NULL, knights_id SERIAL NOT NULL, PRIMARY KEY (knights_id) ); CREATE TABLE followers ( name VARCHAR(10) NOT NULL, instrument VARCHAR(5) NOT NULL, knights_id INTEGER NOT NULL, FOREIGN KEY(knights_id) REFERENCES knights (knights_id) );
!ddlgenerator --inserts postgresql sample_data/knights.yaml | psql pgo
WARNING:root:Primary key knights.knights_id not requested, but nesting demands it CREATE TABLE CREATE TABLE INSERT 0 1 INSERT 0 1 INSERT 0 1 INSERT 0 1 INSERT 0 1 INSERT 0 1 INSERT 0 1
%sql select * from knights
4 rows affected.
name | dob | kg | brave | knights_id | |
---|---|---|---|---|---|
0 | Lancelot | 0471-01-09 00:00:00 | 82.0000 | True | 1 |
1 | Gawain | None | 69.2000 | True | 2 |
2 | Robin | 0471-01-09 00:00:00 | None | False | 3 |
3 | Reepacheep | None | 0.0691 | True | 4 |
%sql select * from followers
3 rows affected.
name | instrument | knights_id | |
---|---|---|---|
0 | Minstrel 1 | harp | 3 |
1 | Minstrel 2 | lute | 3 |
2 | Minstrel 3 | pipes | 3 |
sandmanctl postgresql://username:password@server/dbname
REST server (HATEOAS)
RESTful data server
Compact, rich query language
htsql-ctl serve pgsql://:@/pgo
import matplotlib
matplotlib.use('nbagg')
/home/catherine/ana3/lib/python3.4/site-packages/matplotlib/__init__.py:1256: UserWarning: This call to matplotlib.use() has no effect because the backend has already been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot, or matplotlib.backends is imported for the first time. warnings.warn(_use_error_msg)
lesleys.head()
name | sex | n_births | year | |
---|---|---|---|---|
89 | Leslie | F | 3619 | 2000 |
388 | Lesly | F | 742 | 2000 |
429 | Lesley | F | 658 | 2000 |
1313 | Lesli | F | 145 | 2000 |
2248 | Leslye | F | 70 | 2000 |
lesley_summary = lesleys.pivot_table('n_births',
rows='year',
cols='sex',
aggfunc=sum)
lesley_summary
sex | F | M |
---|---|---|
year | ||
2000 | 5324 | 170 |
2001 | 5233 | 139 |
2002 | 4994 | 114 |
2003 | 5187 | 121 |
2004 | 4970 | 106 |
2005 | 4552 | 122 |
2006 | 4412 | 94 |
2007 | 4052 | 89 |
2008 | 3509 | 81 |
2009 | 2945 | 86 |
2010 | 2410 | 53 |
lesley_summary.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7ff7b31bb128>
lesleys.pivot_table('n_births', rows='name', cols='sex',
aggfunc='sum')
sex | F | M |
---|---|---|
name | ||
Leslee | 324 | 5 |
Leslei | 31 | NaN |
Lesleigh | 61 | NaN |
Lesley | 4335 | 106 |
Lesli | 1495 | NaN |
Leslian | 17 | NaN |
Leslie | 32581 | 1014 |
Leslieann | 62 | NaN |
Leslly | 5 | NaN |
Lesly | 7953 | 50 |
Leslyann | 16 | NaN |
Leslye | 708 | NaN |
lesleys.pivot_table('n_births', rows='name', cols='sex',
aggfunc='sum'
).plot(kind='barh', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff7b4162748>
yob2000.head()
name | sex | n_births | |
---|---|---|---|
0 | Emily | F | 25949 |
1 | Hannah | F | 23066 |
2 | Madison | F | 19965 |
3 | Ashley | F | 17991 |
4 | Sarah | F | 17677 |
yob2000[yob2000.n_births > 1000].hist(bins=10)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7ff7b3effb00>]], dtype=object)
gsum = gender[(gender.total > 10000) & (gender.andro > -0.3)] \
.sort('andro', ascending=False)
gsum.head()
sex | F | M | total | ratio | andro |
---|---|---|---|---|---|
name | |||||
Jessie | 6075 | 5985 | 12060 | 0.503731 | -0.003731 |
Amari | 7280 | 7571 | 14851 | 0.490203 | -0.009797 |
Peyton | 32371 | 27542 | 59913 | 0.540300 | -0.040300 |
Justice | 6332 | 7478 | 13810 | 0.458508 | -0.041492 |
Riley | 50981 | 43165 | 94146 | 0.541510 | -0.041510 |
gsum['len'] = gsum.index.map(len)
gsum.plot(kind='scatter', x='len', y='ratio',
s=gsum.total/100, alpha=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff7b3ea3588>
import mpld3
mpld3.enable_notebook()
gsum.plot(kind='scatter', x='len', y='ratio',
s=gsum.total/100, alpha=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff7b32b2390>
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
points = ax.scatter(gsum.len, gsum.ratio,
s=gsum.total/100, alpha=0.3)
tooltip = mpld3.plugins.PointLabelTooltip(points, list(gsum.index))
mpld3.plugins.connect(fig, tooltip)
yob2000_common = yob2000[yob2000.n_births > 5000]
hacknied_gender_color_map = {'F': 'red', 'M': 'blue'}
hgcm = hacknied_gender_color_map
yob2000_common.sex.map(hgcm)
0 red 1 red 2 red 3 red 4 red 5 red 6 red 7 red 8 red 9 red 10 red 11 red 12 red 13 red 14 red ... 17715 blue 17716 blue 17717 blue 17718 blue 17719 blue 17720 blue 17721 blue 17722 blue 17723 blue 17724 blue 17725 blue 17726 blue 17727 blue 17728 blue 17729 blue Name: sex, Length: 146, dtype: object
import bokeh.plotting as bp
bp.scatter(yob2000_common.name.str.len(), yob2000_common.n_births,
color=yob2000_common.sex.map(hgcm),
fill_alpha=0.2, size=10)
<bokeh.objects.Plot at 0x7ff7b35cdc50>
bp.curplot().title = "Name length vs. number of births"
bp.show()