%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt pd.set_option('max_columns', 50) # this would be a huge pain to load into a database mo = pd.read_csv('data/mariano-rivera.csv') mo.tail() clean = lambda s: s.replace('$', '')[:-1] if '.' in s else s.replace('$', '') # a lot going on here url = 'https://raw.github.com/gjreda/best-sandwiches/master/data/best-sandwiches-geocode.tsv' sandwiches = pd.read_table(url, sep='\t', converters={'price': lambda s: float(clean(s))}) sandwiches.head(3) gh = pd.read_json('https://api.github.com/repos/pydata/pandas/issues?per_page=3') gh[['body', 'created_at', 'title', 'url']].head(3) clip = pd.read_clipboard() clip.head() # All of this is basically the same as it would be with Postgres, MySQL, or any other database # Just pass pandas a connection object and it'll take care of the rest. from pandas.io import sql import sqlite3 conn = sqlite3.connect('data/towed.db') query = "SELECT * FROM towed" towed = sql.read_sql(query, con=conn, parse_dates={'date':'%m/%d/%Y'}) towed.head() towed[100:105] towed.info() # empty string showing up as non-null value mo.info() # note the nulls in the Awards column mo.describe() # basic stats for any numeric column towed.set_index('date', inplace=True) # SELECT * # FROM towed # WHERE date = '2014-11-04' # LIMIT 5; towed.ix['2014-11-04'] towed.ix['2014-11-04', 'make'] # get a Series back (or individual values, if unique) # SELECT * # FROM towed # WHERE date = '2014-11-04'; towed.reset_index(inplace=True) towed[towed['date'] == '2014-11-04'] towed.set_index('date', inplace=True) cols = ['make', 'color', 'state'] # just passing a list towed[cols][:5] # SELECT * # FROM towed # WHERE make = 'FORD' # ORDER BY state DESC # LIMIT 5; towed[towed['make'] == 'FORD'].sort('state', ascending=False).head() # SELECT * # FROM towed # WHERE color = 'BLK' # AND state = 'MI'; towed[(towed.color == 'BLK') & (towed.state == 'MI')] # SELECT * # FROM towed # WHERE make = 'PORS' # OR 
state = 'NY' # ORDER BY make, color DESC; towed[(towed.make == 'PORS') | (towed.state == 'NY')].sort(['make', 'color'], ascending=[True, False]) # not red Porshe m = (towed.style == '4D') & (towed.state == 'IL') # create a filter mask m.value_counts() # it's just a boolean series # SELECT make, model, color # FROM towed # WHERE style = '4D' AND state = 'IL # ORDER BY date # LIMIT 5; towed[m][['make', 'model', 'color']].sort().head() towed[towed.duplicated()] # this would suck in SQL ... but that's also what primary keys are for # thousands comma in tripduration field ... that's be really annoying to load into a DB ! egrep \"[0-9],[0-9]+\" data/divvy/Divvy_Trips_2013.csv | head -n 5 ! head -n 5 data/divvy/Divvy_Stations_2013.csv ! head -n 5 data/weather.csv from datetime import datetime to_datetime = lambda d: datetime.strptime(d, '%m/%d/%Y %H:%M') # if you know the datetime format, it's much, much faster to explicitly parse it # see here: https://gist.github.com/gjreda/7433f5f70299610d9b6b trips = pd.read_csv('data/divvy/Divvy_Trips_2013.csv', converters={'starttime': to_datetime,'stoptime': to_datetime}, thousands=',') stations = pd.read_csv('data/divvy/Divvy_Stations_2013.csv') trips.info() stations.info() # CREATE TABLE divvy ( # SELECT * # FROM trips # LEFT JOIN stations # ON trips.from_station_name = stations.name # ); divvy = pd.merge(trips, stations, how='left', left_on='from_station_name', right_on='name') divvy.info() # SELECT trip_id, ... divvy.name AS name_origin ... 
# ... continuation of the SELECT above:
#   , stations.name AS name_dest
# FROM divvy
# INNER JOIN stations
# ON divvy.to_station_name = stations.name
divvy = pd.merge(divvy, stations, how='inner',
                 left_on='to_station_name', right_on='name',
                 suffixes=['_origin', '_dest'])
divvy.info()

df1 = pd.DataFrame({'a': np.random.randn(3), 'b': np.square(range(100, 103))})
# np.NaN was removed in NumPy 2.0 — np.nan is the canonical spelling.
df2 = pd.DataFrame({'a': [np.nan, 11, 99], 'b': np.random.randn(3)})

# SELECT a, b
# FROM df1
# UNION
# SELECT a, b
# FROM df2;
# (strictly this is UNION ALL — concat keeps duplicate rows)
# takes a list, so you can pass 2+ dataframes
# note that the indexes are the same
pd.concat([df1, df2])

# it's precise to the minute - want to see counts by nearest hour
# Series.order() was removed in pandas 0.20 — sort_values() replaces it.
divvy.starttime.value_counts().sort_values()[:3]

# I don't want to just extract the hour from the datetime ... NEAREST hour
from datetime import timedelta

def round_time(dt, mins=60):
    """
    Rounds a datetime object to its nearest hour (by default).
    (e.g. 2014-02-01 10:45:13 becomes 2014-02-01 11:00:00)
    """
    # push halfway into the next bucket, then truncate back to the boundary
    dt += timedelta(minutes=mins / 2.)
    # also zero out microseconds so the result truly lands on the boundary
    # (a no-op for this data, which is parsed at minute precision)
    dt -= timedelta(minutes=dt.minute % mins, seconds=dt.second,
                    microseconds=dt.microsecond)
    return dt

divvy['starthour'] = divvy.starttime.apply(round_time)
divvy['stophour'] = divvy.stoptime.apply(round_time)
divvy[['starttime', 'starthour']].head(3)

# SELECT starthour, count(1)
# FROM divvy
# GROUP BY starthour
divvy.groupby('starthour').size()

# SELECT starthour, avg(tripduration)
# FROM divvy
# GROUP BY starthour
# ORDER BY avg(tripduration) DESC
# LIMIT 5
divvy.groupby('starthour')['tripduration'].mean().sort_values(ascending=False)[:5]

divvy.groupby('usertype').agg({'birthyear': pd.Series.nunique,
                               'tripduration': [np.mean, np.median]})

# number of trips started by starthour
divvy.groupby('starthour').size().plot(figsize=(16, 8))

# distribution of tripduration
divvy.tripduration.hist(figsize=(16, 8), bins=1000)
plt.xlim(0, 10000);

duration_counts = divvy.tripduration.value_counts()
duration_counts.index.name = 'seconds'
duration_counts.name = 'trips'
duration_counts.head()

df = duration_counts.reset_index()
df['minutes'] = df.seconds / 60.
df.set_index('minutes', inplace=True)
# bare DataFrame.sort() (sort-by-index) was removed in pandas 0.20 —
# sort_index() is the modern equivalent.
df.sort_index(inplace=True)

# cumulative share of trips by duration (in minutes)
(df.trips.cumsum() / df.trips.sum()).plot(figsize=(16, 8))
plt.xlim(0, 60)
plt.yticks(np.arange(0, 1.1, 0.1));

plt.figure(figsize=(9, 18))
# Series.order() was removed in pandas 0.20 — sort_values() replaces it.
divvy.groupby('birthyear').size().sort_values().plot(kind='barh')

divvy.groupby(divvy['starttime'].apply(lambda d: d.dayofweek))['trip_id'].count().plot(kind='bar')
plt.title('Divvy trips by weekday')  # 0 = Monday ...
plt.xlabel('Weekday')
plt.ylabel('# of trips started');

divvy['startdate'] = divvy.starthour.apply(lambda d: d.date())
by_gender = divvy.groupby(['startdate', 'gender']).size()
by_gender.head()
by_gender.unstack(1).head()

# SELECT startdate
#      , COUNT(IF(gender = 'Female', 1, NULL))
#      , COUNT(IF(gender = 'Male', 1, NULL))
# FROM divvy
# GROUP BY startdate
# LIMIT 5;
divvy.groupby(['startdate', 'gender']).size().unstack(1).head()

by_gender.unstack(1).plot(figsize=(16, 8))
divvy.groupby(['startdate', 'usertype']).size().unstack(1).plot(figsize=(16, 8),
                                                                subplots=True);

weekdays = divvy['starttime'].apply(lambda d: d.dayofweek)
hours = divvy['starttime'].apply(lambda d: d.hour)
by_weekday_hour = divvy.groupby([weekdays, hours])['trip_id'].count()
by_weekday_hour.index.names = ['weekday', 'hour']  # rename MultiIndex
by_weekday_hour.unstack(0).plot(figsize=(16, 8))
plt.title('Trips by weekday hour')
plt.ylabel('# of trips started')
plt.xlabel('Hour of day')
plt.xticks(range(24))
plt.xlim(0, 23);