Catherine Devlin ([email protected], @catherinedevlin)
Postgres Open 2014
All About Recent Data-Handling Developments in Python
Some Recent Data-Handling Developments in Python
A Few Recent Data-Handling Developments in Python
(aka "Project Jupyter")
Browser-based enhanced Python environment
pip install ipython_sql
pip install psycopg2
%load_ext sql
%sql postgresql://:@/pgo
%sql SELECT * FROM yob2010 LIMIT 5
%%sql
SELECT *
FROM yob2010
LIMIT 5
names = %sql SELECT * FROM yob2010
names[2]
names[2].sex
WARNINGS:
%sql
import pandas as pd
!head sample_data/yob2000.txt
yob2000 = pd.read_csv('sample_data/yob2000.txt', names=['name','sex','n_births'])
yob2000.head()
pd.read_excel
yob2000.head(6).T
yob2000.head(6).T.sum(axis=1)
ls sample_data/yob*.txt
all_names = pd.DataFrame()
for yr in range(2000, 2011):
one_yr = pd.read_csv('sample_data/yob%d.txt' % yr, names=['name','sex','n_births'])
one_yr['year'] = yr
all_names = all_names.append(one_yr, ignore_index=True)
all_names.head()
all_names.tail()
lesleys = all_names[all_names.name.str.startswith('Lesl')]
lesleys
lesleys.pivot_table('n_births', rows='year',
cols='sex', aggfunc=sum)
gender = all_names.pivot_table('n_births', rows='name',
cols='sex', aggfunc=sum)
gender
gender['total'] = gender.F + gender.M
gender['ratio'] = gender.F / gender.total
gender.head()
gender['andro'] = -1 * (abs(0.5 - gender.ratio))
gender.sort('andro', ascending=False)[
gender.total > 10000].head()
states = pd.read_csv('sample_data/states.csv')
states
states.drop_duplicates(['state','capitol'])
states.drop_duplicates(['state','capitol'],take_last=True)
%sql SELECT count(*) FROM yob2010
import sqlalchemy as sa
db_engine = sa.create_engine('postgresql://:@/pgo')
allpeople = pd.read_sql_table('yob2010',
db_engine)
xmen = allpeople[allpeople.name.str.startswith('X')]
xmen.to_sql('xmen', db_engine)
%sql SELECT * FROM xmen LIMIT 5
chris = %sql SELECT * FROM yob2010 WHERE name LIKE 'Chris%'
chris[:5]
chrises = chris.DataFrame()
chrises.head()
%config SqlMagic.autopandas = True
Special bonus: autopandas
makes biiiiiiiiig queries safer in %sql
erics = %sql SELECT * FROM yob2010 WHERE name LIKE 'Eric%'
erics.head()
%sql SELECT count(*) FROM erics
%sql PERSIST erics
%sql SELECT * FROM erics LIMIT 5
pip install ddlgenerator
pip install beautifulsoup4
cat sample_data/animals.csv
!ddlgenerator postgresql sample_data/animals.csv
!ddlgenerator --inserts postgresql sample_data/animals.csv | psql pgo
%sql SELECT * FROM animals
!ddlgenerator postgresql --inserts http://en.wikipedia.org/wiki/List_of_cities_in_Ohio | psql pgo > /dev/null
%sql SELECT * FROM list_of_cities_in_ohio LIMIT 5;
!cat sample_data/knights.yaml
!ddlgenerator postgresql sample_data/knights.yaml
!ddlgenerator --inserts postgresql sample_data/knights.yaml | psql pgo
%sql select * from knights
%sql select * from followers
sandmanctl postgresql://username:[email protected]/dbname
REST server (HATEOAS)
Compact, rich query language
htsql-ctl serve pgsql://:@/pgo
import matplotlib
matplotlib.use('nbagg')
lesleys.head()
lesley_summary = lesleys.pivot_table('n_births',
rows='year',
cols='sex',
aggfunc=sum)
lesley_summary
lesley_summary.plot()
lesleys.pivot_table('n_births', rows='name', cols='sex',
aggfunc='sum')
lesleys.pivot_table('n_births', rows='name', cols='sex',
aggfunc='sum'
).plot(kind='barh', stacked=True)
yob2000.head()
yob2000[yob2000.n_births > 1000].hist(bins=10)
gsum = gender[(gender.total > 10000) & (gender.andro > -0.3)] \
.sort('andro', ascending=False)
gsum.head()
gsum['len'] = gsum.index.map(len)
gsum.plot(kind='scatter', x='len', y='ratio',
s=gsum.total/100, alpha=0.3)
import mpld3
mpld3.enable_notebook()
gsum.plot(kind='scatter', x='len', y='ratio',
s=gsum.total/100, alpha=0.3)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
points = ax.scatter(gsum.len, gsum.ratio,
s=gsum.total/100, alpha=0.3)
tooltip = mpld3.plugins.PointLabelTooltip(points, list(gsum.index))
mpld3.plugins.connect(fig, tooltip)
yob2000_common = yob2000[yob2000.n_births > 5000]
hacknied_gender_color_map = {'F': 'red', 'M': 'blue'}
hgcm = hacknied_gender_color_map
yob2000_common.sex.map(hgcm)
import bokeh.plotting as bp
bp.scatter(yob2000_common.name.str.len(), yob2000_common.n_births,
color=yob2000_common.sex.map(hgcm),
fill_alpha=0.2, size=10)
bp.output_notebook()