Exploring a dataset in the Notebook

Provenance of the data

Downloading and loading a dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Work from the chapter directory so the relative 'data/...' paths used below
# resolve. The location is hard-coded; adjust it to your own checkout.
%cd ~/minibook/chapter2/
In [3]:
!wget https://raw.githubusercontent.com/ipython-books/minibook-2nd-data/master/nyc_taxi.zip
!unzip nyc_taxi.zip
In [4]:
# List the contents of the data/ directory (presumably populated by the unzip
# step above) to check that the CSV files are present.
%ls data
Out[4]:
nyc_data.csv  nyc_fare.csv  [...]
In [5]:
# Paths, relative to the working directory, of the two CSV files loaded below.
data_filename, fare_filename = (
    'data/nyc_data.csv',
    'data/nyc_fare.csv',
)
In [6]:
# Load both tables, converting the timestamp columns to datetimes while reading
# rather than leaving them as strings.
data = pd.read_csv(
    data_filename,
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
fare = pd.read_csv(
    fare_filename,
    parse_dates=['pickup_datetime'],
)
In [7]:
# Preview the first three rows of the trip table.
data.head(3)

Making plots with matplotlib

In [8]:
# List the column names to find the coordinate fields used for plotting.
data.columns
Out[8]:
Index(['medallion',
       ...
       'pickup_datetime',
       'dropoff_datetime',
       'passenger_count',
       'trip_time_in_secs',
       'trip_distance',
       'pickup_longitude',
       'pickup_latitude',
       'dropoff_longitude',
       'dropoff_latitude'], dtype='object')
In [9]:
# Short aliases for the pickup (p_) and dropoff (d_) coordinate columns.
# Bracket indexing is used so the lookup is unambiguously a column access.
p_lng = data['pickup_longitude']
p_lat = data['pickup_latitude']
d_lng = data['dropoff_longitude']
d_lat = data['dropoff_latitude']
In [10]:
# Display the pickup longitudes (pandas truncates the long Series on output).
p_lng
Out[10]:
0        -73.955925
1        -74.005501
...
846943   -73.978477
846944   -73.987206
Name: pickup_longitude, Length: 846945, dtype: float64
In [11]:
def lat_lng_to_pixels(lat, lng):
    """Project geographic coordinates onto a plane with the Mercator formula.

    Parameters
    ----------
    lat, lng : scalar, numpy array or pandas Series
        Latitude and longitude in degrees. Only arithmetic and numpy ufuncs
        are applied, so array-like inputs are processed element-wise and keep
        their container type.

    Returns
    -------
    (x, y) : tuple
        ``x`` maps longitudes -180..180 linearly onto 0..100.
        ``y`` is the Mercator ordinate rescaled so that the equator lands at
        -50 and values grow towards the north.
    """
    # Latitude in radians.
    phi = lat * np.pi / 180.0
    # Mercator ordinate: ln(tan(pi/4 + phi/2)).
    mercator = np.log(np.tan((phi + np.pi / 2.0) / 2.0))
    x = 100 * (lng + 180.0) / 360.0
    y = 100 * (mercator - np.pi) / (2.0 * np.pi)
    return x, y
In [12]:
# Project the pickup positions; keyword arguments make the (lat, lng) argument
# order explicit at the call site.
px, py = lat_lng_to_pixels(lat=p_lat, lng=p_lng)
In [13]:
# Display the projected x coordinates of the pickups.
px
Out[13]:
0         29.456688
1         29.442916
...
846943    29.450423
846944    29.447998
Name: pickup_longitude, dtype: float64
In [14]:
# First pass: scatter every pickup position with matplotlib's default marker
# settings.
plt.scatter(px, py)
In [15]:
# Refined version of the pickup scatter plot on a larger canvas.
plt.figure(figsize=(8, 6))
# Tiny, almost transparent markers: with 846,945 points, overlapping markers
# accumulate opacity, so busy areas appear darker instead of one solid blob.
plt.scatter(px, py, s=.1, alpha=.03)
# Use the same scale on both axes.
plt.axis('equal')
# Restrict the view to a fixed window in projected coordinates.
plt.xlim(29.40, 29.55)
plt.ylim(-37.63, -37.54)
# Hide the axes frame, ticks and labels.
plt.axis('off')

Descriptive statistics with pandas and seaborn

In [16]:
# Number of non-missing values and the extremes of the projected x coordinate.
px.count(), px.min(), px.max()
Out[16]:
(846945, 29.417137499999995, 29.714313055555561)
In [17]:
# Central tendency (mean, median) and spread (standard deviation) of px.
px.mean(), px.median(), px.std()
Out[17]:
(29.451345807768575, 29.449418333333337, 0.0097616942794720614)
In [18]:
# Install seaborn without prompts (-q: quiet output, -y: answer yes automatically).
# NOTE(review): `!conda` acts on whichever conda environment the shell resolves;
# presumably that is the environment the kernel runs in — confirm.
!conda install seaborn -q -y
In [19]:
# Imported here rather than with the other imports because seaborn is only
# installed by the preceding cell.
import seaborn as sns
# Show the installed version.
sns.__version__
Out[19]:
'0.6.0'
In [20]:
# Histogram of trip distances using 100 evenly spaced bin edges on [0, 10]
# (i.e. 99 bins); distances beyond 10 fall outside every bin and are not drawn.
data['trip_distance'].hist(bins=np.linspace(0., 10., 100))