Manipulating data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Trip records; both timestamp columns are parsed into datetimes while loading.
data = pd.read_csv(
    'data/nyc_data.csv',
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
# Fare records; only the pickup timestamp is parsed here.
fare = pd.read_csv(
    'data/nyc_fare.csv',
    parse_dates=['pickup_datetime'],
)

Selecting data

In [2]:
# Keep every row but only these two columns, then preview the first three rows.
data.loc[:, ['trip_distance', 'trip_time_in_secs']].head(3)
Out[2]:
   trip_distance  trip_time_in_secs
0           0.61                300
1           3.28                960
2           1.50                386
In [3]:
# Row with index label 0, all columns; the result is a Series keyed by column name.
data.loc[0, :]
Out[3]:
medallion             76942C3205E17D7E7FE5A9F709D16434
hack_license          25BA06A87905667AA1FE5990E33F0E2E
vendor_id                                          VTS
rate_code                                            1
store_and_fwd_flag                                 NaN
pickup_datetime                    2013-01-01 00:00:00
dropoff_datetime                   2013-01-01 00:05:00
passenger_count                                      3
trip_time_in_secs                                  300
trip_distance                                     0.61
pickup_longitude                             -73.95592
pickup_latitude                               40.78189
dropoff_longitude                            -73.96318
dropoff_latitude                              40.77783
Name: 0, dtype: object
In [4]:
# A list of labels selects several rows at once and returns a DataFrame.
data.loc[[0, 100000], :]
In [5]:
# Label-based slice with a step: every 10th row from label 1000 up to and
# including label 2000, restricted to the two columns of interest.
data.loc[1000:2000:10, ['trip_distance', 'trip_time_in_secs']]
Out[5]:
      trip_distance  trip_time_in_secs
1000           1.00                441
1010           3.80                691
....
1990           0.13                 60
2000           9.60                963
In [6]:
# Boolean-mask selection: keep only the rows whose trip_distance exceeds 50.
data.loc[data['trip_distance'] > 50]
In [7]:
from ipywidgets import interact
In [8]:
@interact
def show_nrows(distance_threshold=(0, 200)):
    """Count the trips whose distance is strictly above the threshold.

    The ``(0, 200)`` default is consumed by ``interact`` to build the
    slider; the function itself is called with the slider's current value.

    Parameters
    ----------
    distance_threshold : number
        Rows with ``trip_distance > distance_threshold`` are counted.

    Returns
    -------
    int
        Number of rows of ``data`` above the threshold.
    """
    # Sum the boolean mask instead of materialising the filtered frame:
    # the callback fires on every slider move, and copying all matching
    # rows just to take their length is wasted work.
    return int((data.trip_distance > distance_threshold).sum())

Computing with numbers

In [9]:
# Derived column: trip duration converted from seconds to minutes (float).
data['trip_time_in_mins'] = data['trip_time_in_secs'].div(60.0)
In [10]:
# Show the original seconds column next to the derived minutes column.
data.loc[:, ['trip_time_in_secs', 'trip_time_in_mins']].head(3)
Out[10]:
   trip_time_in_secs  trip_time_in_mins
0                300           5.000000
1                960          16.000000
2                386           6.433333
In [11]:
# First five distances, selected by position (index labels 0 through 4 here).
a = data['trip_distance'].iloc[:5]
a
Out[11]:
0    0.61
1    3.28
2    1.50
3    0.00
4    1.31
Name: trip_distance, dtype: float64
In [12]:
# Positions 2 up to (not including) 6; this range partly overlaps `a`.
b = data['trip_distance'].iloc[2:6]
b
Out[12]:
2    1.50
3    0.00
4    1.31
5    5.81
Name: trip_distance, dtype: float64
In [13]:
# Addition aligns the operands on index labels; a label present in only one
# of the two Series produces NaN in the result.
a.add(b)
Out[13]:
0     NaN
1     NaN
2    3.00
3    0.00
4    2.62
5     NaN
Name: trip_distance, dtype: float64

Working with text

In [14]:
# Preview the first three medallion identifiers (string column).
data['medallion'].head(3)
Out[14]:
0    76942C3205E17D7E7FE5A9F709D16434
1    517C6B330DBB3F055D007B07512628B3
2    ED15611F168E41B33619C83D900FE266
Name: medallion, dtype: object
In [15]:
# Vectorised string slicing: the first four characters of each medallion.
data['medallion'].str[:4].head(3)
Out[15]:
0    7694
1    517C
2    ED15
Name: medallion, dtype: object

Working with dates and times

In [16]:
# Day of week of each pickup (pandas numbers Monday as 0), sampled at every
# 200000th row to keep the output short.
data['pickup_datetime'].dt.dayofweek.iloc[::200000]
Out[16]:
0         1
200000    6
400000    5
600000    0
800000    1
dtype: int64
In [17]:
# Day-of-month component of the pickup and drop-off timestamps.
day_p = data['pickup_datetime'].dt.day
day_d = data['dropoff_datetime'].dt.day
# True for the trips whose pickup and drop-off fall on different days of the month.
selection = day_p.ne(day_d)
# Each True in the mask is one selected row, so its sum is the row count.
print(int(selection.sum()))
data.loc[selection].head(3)
Out[17]:
7716

Handling missing data