Exploring a dataset in the Notebook

Provenance of the data

Downloading and loading a dataset

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

%cd ~/minibook/chapter2/

In [3]:

!wget https://raw.githubusercontent.com/ipython-books/minibook-2nd-data/master/nyc_taxi.zip
!unzip nyc_taxi.zip

In [4]:

%ls data

Out[4]:

nyc_data.csv  nyc_fare.csv  [...]

In [5]:

data_filename = 'data/nyc_data.csv'
fare_filename = 'data/nyc_fare.csv'

In [6]:

# Load both CSV files into DataFrames. The columns listed in parse_dates are
# converted to datetime values at read time instead of being kept as strings.
data = pd.read_csv(data_filename, parse_dates=['pickup_datetime',
                                               'dropoff_datetime'])
fare = pd.read_csv(fare_filename, parse_dates=['pickup_datetime'])

In [7]:

data.head(3)

Making plots with matplotlib

In [8]:

data.columns

Out[8]:

Index(['medallion',
       ...
       'pickup_datetime',
       'dropoff_datetime',
       'passenger_count',
       'trip_time_in_secs',
       'trip_distance',
       'pickup_longitude',
       'pickup_latitude',
       'dropoff_longitude',
       'dropoff_latitude'], dtype='object')

In [9]:

p_lng = data.pickup_longitude
p_lat = data.pickup_latitude
d_lng = data.dropoff_longitude
d_lat = data.dropoff_latitude

In [10]:

p_lng

Out[10]:

0        -73.955925
1        -74.005501
...
846943   -73.978477
846944   -73.987206
Name: pickup_longitude, Length: 846945, dtype: float64

In [11]:

def lat_lng_to_pixels(lat, lng):
    """Project latitude/longitude given in degrees onto planar coordinates.

    The longitude is rescaled linearly, so -180..180 degrees maps to 0..100.
    The latitude goes through the Mercator transform log(tan(pi/4 + lat/2))
    and is then shifted by pi and scaled by 100 / (2*pi); the equator
    therefore lands at y = -50.

    Only NumPy ufuncs and arithmetic are used, so scalars, NumPy arrays and
    pandas Series are all handled element-wise.

    Parameters
    ----------
    lat : latitude(s) in degrees.
    lng : longitude(s) in degrees.

    Returns
    -------
    tuple
        ``(x, y)`` with the same shape as the inputs.
    """
    # Horizontal coordinate: plain linear rescaling of the longitude.
    x = 100 * (lng + 180.0) / 360.0

    # Vertical coordinate: degrees -> radians -> Mercator -> rescale.
    lat_radians = lat * np.pi / 180.0
    mercator = np.log(np.tan((lat_radians + np.pi / 2.0) / 2.0))
    y = 100 * (mercator - np.pi) / (2.0 * np.pi)

    return x, y

In [12]:

px, py = lat_lng_to_pixels(p_lat, p_lng)

In [13]:

px

Out[13]:

0         29.456688
1         29.442916
...
846943    29.450423
846944    29.447998
Name: pickup_longitude, dtype: float64

In [14]:

plt.scatter(px, py)

In [15]:

# Redraw the pickup locations with styling suited to a very dense point cloud.
plt.figure(figsize=(8, 6))
# Tiny, almost transparent markers: overlapping points add up, so busy areas
# show up darker instead of saturating the plot.
plt.scatter(px, py, s=.1, alpha=.03)
# Same scale on both axes so the projected coordinates are not distorted.
plt.axis('equal')
# Restrict the view to a small window of the projected coordinates.
plt.xlim(29.40, 29.55)
plt.ylim(-37.63, -37.54)
# Hide the axes, ticks and frame.
plt.axis('off')

Descriptive statistics with pandas and seaborn

In [16]:

px.count(), px.min(), px.max()

Out[16]:

(846945, 29.417137499999995, 29.714313055555561)

In [17]:

px.mean(), px.median(), px.std()

Out[17]:

(29.451345807768575, 29.449418333333337, 0.0097616942794720614)

In [18]:

!conda install seaborn -q -y

In [19]:

import seaborn as sns
sns.__version__

Out[19]:

'0.6.0'

In [20]:

data.trip_distance.hist(bins=np.linspace(0., 10., 100))