%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt pd.set_option('max_columns', 50) # this would be a huge pain to load into a database mo = pd.read_csv('data/mariano-rivera.csv') mo.tail() clean = lambda s: s.replace('$', '')[:-1] if '.' in s else s.replace('$', '') # a lot going on here url = 'https://raw.github.com/gjreda/best-sandwiches/master/data/best-sandwiches-geocode.tsv' sandwiches = pd.read_table(url, sep='\t', converters={'price': lambda s: float(clean(s))}) sandwiches.head(3) gh = pd.read_json('https://api.github.com/repos/pydata/pandas/issues?per_page=3') gh[['body', 'created_at', 'title', 'url']].head(3) clip = pd.read_clipboard() clip.head() # All of this is basically the same as it would be with Postgres, MySQL, or any other database # Just pass pandas a connection object and it'll take care of the rest. from pandas.io import sql import sqlite3 conn = sqlite3.connect('data/towed.db') query = "SELECT * FROM towed" towed = sql.read_sql(query, con=conn, parse_dates={'date':'%m/%d/%Y'}) towed.head() towed[100:105] towed.info() # empty string showing up as non-null value mo.info() # note the nulls in the Awards column mo.describe() # basic stats for any numeric column towed.set_index('date', inplace=True) # SELECT * # FROM towed # WHERE date = '2014-11-04' # LIMIT 5; towed.ix['2014-11-04'] towed.ix['2014-11-04', 'make'] # get a Series back (or individual values, if unique) # SELECT * # FROM towed # WHERE date = '2014-11-04'; towed.reset_index(inplace=True) towed[towed['date'] == '2014-11-04'] towed.set_index('date', inplace=True) cols = ['make', 'color', 'state'] # just passing a list towed[cols][:5] # SELECT * # FROM towed # WHERE make = 'FORD' # ORDER BY state DESC # LIMIT 5; towed[towed['make'] == 'FORD'].sort('state', ascending=False).head() # SELECT * # FROM towed # WHERE color = 'BLK' # AND state = 'MI'; towed[(towed.color == 'BLK') & (towed.state == 'MI')] # SELECT * # FROM towed # WHERE make = 'PORS' # OR 
state = 'NY' # ORDER BY make, color DESC; towed[(towed.make == 'PORS') | (towed.state == 'NY')].sort(['make', 'color'], ascending=[True, False]) # not red Porshe m = (towed.style == '4D') & (towed.state == 'IL') # create a filter mask m.value_counts() # it's just a boolean series # SELECT make, model, color # FROM towed # WHERE style = '4D' AND state = 'IL # ORDER BY date # LIMIT 5; towed[m][['make', 'model', 'color']].sort().head() towed[towed.duplicated()] # this would suck in SQL ... but that's also what primary keys are for # thousands comma in tripduration field ... that's be really annoying to load into a DB ! egrep \"[0-9],[0-9]+\" data/divvy/Divvy_Trips_2013.csv | head -n 5 ! head -n 5 data/divvy/Divvy_Stations_2013.csv ! head -n 5 data/weather.csv from datetime import datetime to_datetime = lambda d: datetime.strptime(d, '%m/%d/%Y %H:%M') # if you know the datetime format, it's much, much faster to explicitly parse it # see here: https://gist.github.com/gjreda/7433f5f70299610d9b6b trips = pd.read_csv('data/divvy/Divvy_Trips_2013.csv', converters={'starttime': to_datetime,'stoptime': to_datetime}, thousands=',') stations = pd.read_csv('data/divvy/Divvy_Stations_2013.csv') trips.info() stations.info() # CREATE TABLE divvy ( # SELECT * # FROM trips # LEFT JOIN stations # ON trips.from_station_name = stations.name # ); divvy = pd.merge(trips, stations, how='left', left_on='from_station_name', right_on='name') divvy.info() # SELECT trip_id, ... divvy.name AS name_origin ... 
# ... continuation of the SELECT above:
#   , stations.name AS name_dest
# FROM divvy
# INNER JOIN stations
# ON divvy.to_station_name = stations.name
divvy = pd.merge(divvy, stations, how='inner',
                 left_on='to_station_name', right_on='name',
                 suffixes=['_origin', '_dest'])
divvy.info()

df1 = pd.DataFrame({'a': np.random.randn(3), 'b': np.square(range(100, 103))})
# np.NaN was removed in NumPy 2.0 — np.nan is the canonical spelling.
df2 = pd.DataFrame({'a': [np.nan, 11, 99], 'b': np.random.randn(3)})

# SELECT a, b
# FROM df1
# UNION
# SELECT a, b
# FROM df2;
# (strictly this is UNION ALL — concat keeps duplicate rows)
# takes a list, so you can pass 2+ dataframes
# note that the indexes are the same
pd.concat([df1, df2])

# it's precise to the minute - want to see counts by nearest hour
# Series.order() was removed in pandas 0.20 — sort_values() replaces it.
divvy.starttime.value_counts().sort_values()[:3]

# I don't want to just extract the hour from the datetime ... NEAREST hour
from datetime import timedelta

def round_time(dt, mins=60):
    """
    Rounds a datetime object to its nearest hour (by default).
    (e.g. 2014-02-01 10:45:13 becomes 2014-02-01 11:00:00)
    """
    # push halfway into the next bucket, then truncate back to the boundary
    dt += timedelta(minutes=mins / 2.)
    # also zero out microseconds so the result truly lands on the boundary
    # (a no-op for this data, which is parsed at minute precision)
    dt -= timedelta(minutes=dt.minute % mins, seconds=dt.second,
                    microseconds=dt.microsecond)
    return dt

divvy['starthour'] = divvy.starttime.apply(round_time)
divvy['stophour'] = divvy.stoptime.apply(round_time)
divvy[['starttime', 'starthour']].head(3)

# SELECT starthour, count(1)
# FROM divvy
# GROUP BY starthour
divvy.groupby('starthour').size()

# SELECT starthour, avg(tripduration)
# FROM divvy
# GROUP BY starthour
# ORDER BY avg(tripduration) DESC
# LIMIT 5
divvy.groupby('starthour')['tripduration'].mean().sort_values(ascending=False)[:5]

divvy.groupby('usertype').agg({'birthyear': pd.Series.nunique,
                               'tripduration': [np.mean, np.median]})

# number of trips started by starthour
divvy.groupby('starthour').size().plot(figsize=(16, 8))

# distribution of tripduration
divvy.tripduration.hist(figsize=(16, 8), bins=1000)
plt.xlim(0, 10000);

duration_counts = divvy.tripduration.value_counts()
duration_counts.index.name = 'seconds'
duration_counts.name = 'trips'
duration_counts.head()

df = duration_counts.reset_index()
df['minutes'] = df.seconds / 60.
df.set_index('minutes', inplace=True)
# bare DataFrame.sort() (sort-by-index) was removed in pandas 0.20 —
# sort_index() is the modern equivalent.
df.sort_index(inplace=True)

# cumulative share of trips by duration (in minutes)
(df.trips.cumsum() / df.trips.sum()).plot(figsize=(16, 8))
plt.xlim(0, 60)
plt.yticks(np.arange(0, 1.1, 0.1));

plt.figure(figsize=(9, 18))
# Series.order() was removed in pandas 0.20 — sort_values() replaces it.
divvy.groupby('birthyear').size().sort_values().plot(kind='barh')

divvy.groupby(divvy['starttime'].apply(lambda d: d.dayofweek))['trip_id'].count().plot(kind='bar')
plt.title('Divvy trips by weekday')  # 0 = Monday ...
plt.xlabel('Weekday')
plt.ylabel('# of trips started');

divvy['startdate'] = divvy.starthour.apply(lambda d: d.date())
by_gender = divvy.groupby(['startdate', 'gender']).size()
by_gender.head()
by_gender.unstack(1).head()

# SELECT startdate
#      , COUNT(IF(gender = 'Female', 1, NULL))
#      , COUNT(IF(gender = 'Male', 1, NULL))
# FROM divvy
# GROUP BY startdate
# LIMIT 5;
divvy.groupby(['startdate', 'gender']).size().unstack(1).head()

by_gender.unstack(1).plot(figsize=(16, 8))
divvy.groupby(['startdate', 'usertype']).size().unstack(1).plot(figsize=(16, 8),
                                                                subplots=True);

weekdays = divvy['starttime'].apply(lambda d: d.dayofweek)
hours = divvy['starttime'].apply(lambda d: d.hour)
by_weekday_hour = divvy.groupby([weekdays, hours])['trip_id'].count()
by_weekday_hour.index.names = ['weekday', 'hour']  # rename MultiIndex
by_weekday_hour.unstack(0).plot(figsize=(16, 8))
plt.title('Trips by weekday hour')
plt.ylabel('# of trips started')
plt.xlabel('Hour of day')
plt.xticks(range(24))
plt.xlim(0, 23);