#!/usr/bin/env python
# coding: utf-8

# New York Subway and Weather Data
#
# Working with 2D data using NumPy and pandas.  Questions explored:
# * What variables are related to subway ridership?
# * What stations have the most riders?
# * What are the ridership patterns over time?  On weekends/holidays?
# * How does the weather affect ridership, how does it vary across the
#   city, and how does the temperature rise through the month?

# Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# '%pylab inline' is an IPython magic: get_ipython() only exists inside
# a notebook/IPython session, so guard it to keep this runnable as a
# plain script.
try:
    get_ipython().run_line_magic('pylab', 'inline')
except NameError:
    pass

# Ways to store 2D data:
# * pure Python: lists of lists
# * NumPy: 2D array (more memory efficient; element access differs)
# * pandas: DataFrame

# Sample NumPy 2D array: subway ridership for 5 stations over 10 days.
# Rows are days, columns are train stations.
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

# dtype of the 2D array (a single homogeneous integer type)
ridership.dtype


def mean_riders_for_max_station(ridership):
    """Return (overall_mean, mean_for_max) for a days x stations array.

    * mean_for_max: mean riders per day at the station that had the
      most riders on day 1 (row 0).
    * overall_mean: mean ridership over all days and stations.
    """
    max_station = ridership[0, :].argmax()  # column position of day-1 max
    mean_for_max = ridership[:, max_station].mean()
    overall_mean = ridership.mean()
    return (overall_mean, mean_for_max)


# The station with the highest day-1 ridership also has above-average
# mean ridership -- interesting, but not very useful on its own.
mean_riders_for_max_station(ridership)
import numpy as np
import pandas as pd

# Use NumPy's axis argument for more practical per-column statistics:
# * mean ridership per day for each subway station (axis=0)
# * the maximum and minimum of those per-station daily means


def min_and_max_riders_per_day(ridership):
    """Return (max_daily_ridership, min_daily_ridership).

    `ridership` is a 2D array with rows = days and columns = stations.
    A mean over days is computed for each station, and the largest and
    smallest of those per-station means are returned.
    """
    station_riders = ridership.mean(axis=0)  # mean of each column (station)
    max_daily_ridership = station_riders.max()
    min_daily_ridership = station_riders.min()
    return (max_daily_ridership, min_daily_ridership)


if __name__ == "__main__":
    # For the sample data: the busiest station averages 3239.9 riders
    # per day, the quietest averages 1071.2.  (`ridership` is the NumPy
    # array defined earlier in this script.)
    print(min_and_max_riders_per_day(ridership))

# The same sample data as a pandas DataFrame.  Advantages over a plain
# 2D NumPy array:
# * an index for the rows (dates)
# * names for the columns (station ids)
# * a distinct dtype per column
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Display the frame, then the mean ridership per station (column);
# DataFrame.mean() defaults to axis=0.
ridership_df
ridership_df.mean()
import pandas as pd

# Row/column access on a DataFrame, and the DataFrame version of
# mean_riders_for_max_station.  Main differences from the NumPy one:
# * .iloc[0] instead of [0, :] to access a row
# * [station_label] instead of [:, position] to access a column
# * .values.mean() instead of .mean() to get one overall mean rather
#   than a Series of per-column means


def mean_riders_for_max_station(ridership):
    """DataFrame version: return (overall_mean, mean_for_max).

    `ridership` has rows = days and columns = stations.
    """
    # idxmax() yields the column *label* of the day-1 maximum.  The
    # original used Series.argmax(), whose old label-returning
    # behaviour is gone in modern pandas (argmax() now returns a
    # position), which would break the label lookup on the next line.
    max_station = ridership.iloc[0].idxmax()
    mean_for_max = ridership[max_station].mean()
    overall_mean = ridership.values.mean()
    return (overall_mean, mean_for_max)


if __name__ == "__main__":
    # (`ridership_df` is defined in the previous section.)
    # Mean of each row (axis=1) -> mean ridership per day.
    print(ridership_df.mean(axis=1))

    # NOTE: ridership_df.iloc['R007'] would be wrong -- iloc is purely
    # positional; use a label with [] or loc instead.
    print(ridership_df.iloc[4])           # row by position
    print(ridership_df['R007'])           # column by label
    print(ridership_df.loc['05-09-11'])   # row by index label

    # A single cell, either by positions or by labels.
    print(ridership_df.iloc[0, 3])
    print(ridership_df.loc['05-01-11', 'R006'])

    # Underlying 2D NumPy array: no row/column labels.  Useful for an
    # overall mean; make sure the dtypes are consistent first.
    print(ridership_df.values)
    print(ridership_df.values.mean())

    # Station (column label) with the most riders on the first day.
    print(ridership_df.iloc[0].idxmax())

    # Same result as with the NumPy array implementation.
    print(mean_riders_for_max_station(ridership_df))

    # Load the full NYC subway + weather dataset.
    subway_df = pd.read_csv('nyc_subway_weather.csv')
    print(subway_df.head())  # first few rows instead of the whole frame
    print(len(subway_df))    # number of rows
import pandas as pd

# Pearson's r: linear correlation between two variables.
# * Standardize each variable (subtract the mean, divide by the
#   population standard deviation) so scale does not matter; points far
#   from the mean contribute more to r.
# * Multiply each pair of standardized values and average the products.
# * The result lies between -1 and +1:
#   - positive r: the values tend to be above/below their means
#     together ((+,+) and (-,-) products are positive)
#   - negative r: one value tends to be above while the other is below
#     ((+,-) and (-,+) products are negative)
#   - near 0: little linear correlation
# NumPy's corrcoef() computes the same quantity.


def correlation(x, y):
    """Return Pearson's r for two same-length sequences.

    `x` and `y` may be NumPy arrays or pandas Series.
    """
    # ddof=0 gives the uncorrected (population) standard deviation;
    # pandas' std() defaults to ddof=1, unlike NumPy's.
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)
    return (std_x * std_y).mean()


if __name__ == "__main__":
    # (`subway_df` is loaded in the previous section.)
    # Summary statistics for every column.
    print(subway_df.describe())

    entries = subway_df['ENTRIESn_hourly']
    cum_entries = subway_df['ENTRIESn']
    rain = subway_df['meanprecipi']
    temp = subway_df['meantempi']

    # Entries vs rain: small positive correlation -- more rain goes
    # with more subway entries.
    print(correlation(entries, rain))

    # Entries vs temperature: small negative correlation -- higher
    # temperatures go with fewer subway entries.
    print(correlation(entries, temp))

    # Rain vs temperature: medium negative correlation -- more rain
    # goes with lower temperatures.
    print(correlation(rain, temp))

    # Hourly entries vs cumulative entries: high positive correlation.
    print(correlation(entries, cum_entries))
import pandas as pd

# Pandas means and axis names: axis=0 / 'index' -> mean of each column;
# axis=1 / 'columns' -> mean of each row.  Vectorized operations match
# elements up by index/column labels, not by position as NumPy does.

# The subway data stores *cumulative* entry and exit counters; we want
# the per-interval (hourly) counts instead.

# Sample data: cumulative entries (column 1) and exits (column 2).
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})


def get_hourly_entries_and_exits(entries_and_exits):
    """Return row-to-row differences (the first row becomes NaN).

    shift(1) moves every row down by one slot, so the subtraction
    yields current - previous for each counter column.
    """
    return entries_and_exits - entries_and_exits.shift(1)


# For non-built-in per-column logic, write a helper and use apply():
# * apply() runs the helper once per column (Series) -- e.g.
#   df.apply(np.max) gives the max of each column
# * applymap() runs a helper on every individual element
# Example: the second-largest value in each column.

def second_largest_in_column(column):
    """Return the second-largest value of one column (Series)."""
    sorted_column = column.sort_values(ascending=False)
    # The second value sits at position 1.  Fixed from the original
    # `sorted_column.iloc(1)`: calling iloc returns an indexer object,
    # not a value -- square brackets do the positional lookup.
    return sorted_column.iloc[1]


def second_largest(df):
    """Second-largest value in each column of `df`."""
    return df.apply(second_largest_in_column)


if __name__ == "__main__":
    # Method 1: explicit shift-and-subtract.
    print(get_hourly_entries_and_exits(entries_and_exits))
    # Method 2: the built-in equivalent.
    print(entries_and_exits.diff())

# Addition of pandas objects:
# * DataFrame + DataFrame = DataFrame
# * Series + Series = Series
# * DataFrame + Series = ??? (explored in the next section)
import numpy as np
import pandas as pd

# DataFrame + Series: the Series index is matched against the DataFrame
# *columns*, so each Series value is added to one whole column.

# Series + square DataFrame with 0..3 as both column and index names.
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})
print(df + s)  # each Series value added to the matching column

# Series + one-row DataFrame: same column-wise matching as above.
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print(df + s)

# Series + one-column DataFrame: only column 0 matches; the other
# Series labels have no matching column, so those columns are all NaN.
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})
print(df + s)
# To match the Series against the *rows* instead, use add() with an
# explicit axis (the default is equivalent to axis='columns').
df.add(s, axis='rows')

# Matching column names and Series index names: values line up by label.
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})
print(df + s)

# Non-matching names: no label lines up, so the result is all NaN.
s = pd.Series([1, 2, 3, 4])
print(df + s)

# Standardizing rows/columns with vector operations (no apply needed).
# Example dataset: two exam scores per student.
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 'Fred', 'Greta',
           'Humbert', 'Ivan', 'James']
)
grades_df

grades_df.mean()  # per-exam (column) means

# Standardize by column: subtract each column's mean from that column,
# then divide by its standard deviation.
(grades_df - grades_df.mean()) / grades_df.std()

# Standardize by row: axis='columns' takes per-row statistics, and
# sub()/div() with axis='index' broadcast them down the rows.  (The
# standardized values match the column version here only because there
# are exactly two columns.)
mean_diffs = grades_df.sub(grades_df.mean(axis='columns'), axis='index')
mean_diffs.div(grades_df.std(axis='columns'), axis='index')

# groupby(): split the data into groups, then aggregate (e.g. average)
# within each group.
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
example_df

# .groups is a mapping from each group key to its row labels.
grouped_data = example_df.groupby('even')
print(grouped_data.groups)

# Grouping by several columns keys the groups on tuples.
grouped_data = example_df.groupby(['even', 'above_three'])
print(grouped_data.groups)

# Aggregate: sum within each group.
grouped_data = example_df.groupby('even')
print(grouped_data.sum())

# Only one column of the aggregate...
print(grouped_data.sum()['value'])
# ...or, equivalently, select the column before collapsing -- same
# result, less work.
print(grouped_data['value'].sum())

# groupby() applied to the NYC subway data (next section):
# Q1: How does ridership vary by hour of the day?
# Q2: How does rain affect ridership?  (compare rainy vs dry groups)
# Q3: How does ridership vary by day of week?
import pandas as pd

# groupby on the NYC subway data, per-station hourly counts, joining
# subway and weather tables with merge(), and visualizations.


def hourly_for_group(entries_and_exits):
    """Row-to-row differences within one station's rows (first is NaN)."""
    return entries_and_exits - entries_and_exits.shift(1)


# Small example: cumulative counters for two stations, interleaved.
ex_ridership_df2 = pd.DataFrame({
    'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051',
             'R079', 'R051', 'R079', 'R051'],
    'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00',
              '08:00:00', '10:00:00', '12:00:00', '14:00:00',
              '16:00:00'],
    'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353,
                 8936687, 3144424, 8936819, 3144594],
    'EXITSn': [1088151, 13755385, 1088159, 13755393, 1088177,
               13755598, 1088231, 13756191, 1088275]
})

# Hourly entries/exits per station (UNIT).  The column selection uses a
# list: the original tuple form groupby(...)['ENTRIESn', 'EXITSn'] was
# deprecated and removed in current pandas.  (groupby(...).diff() is
# the built-in equivalent of this apply.)
ridership_by_station = (
    ex_ridership_df2.groupby('UNIT')[['ENTRIESn', 'EXITSn']]
    .apply(hourly_for_group)
)

# Joining multiple DataFrames with merge() (inner/left/right joins) on
# a small synthetic example -- unrelated to the loaded NYC dataset.
ex_subway_df = pd.DataFrame({
    'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003',
             'R004', 'R004', 'R004', 'R004', 'R004'],
    'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
              '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'ENTRIESn': [4388333, 4388348, 4389885, 4391507, 4393043,
                 14656120, 14656174, 14660126, 14664247, 14668301],
    'EXITSn': [2911002, 2911036, 2912127, 2913223, 2914284,
               14451774, 14451851, 14454734, 14457780, 14460818],
    'latitude': [40.689945, 40.689945, 40.689945, 40.689945, 40.689945,
                 40.69132, 40.69132, 40.69132, 40.69132, 40.69132],
    'longitude': [-73.872564, -73.872564, -73.872564, -73.872564,
                  -73.872564, -73.867135, -73.867135, -73.867135,
                  -73.867135, -73.867135]
})

ex_weather_df = pd.DataFrame({
    'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
              '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'latitude': [40.689945, 40.69132, 40.689945, 40.69132, 40.689945,
                 40.69132, 40.689945, 40.69132, 40.689945, 40.69132],
    'longitude': [-73.872564, -73.867135, -73.872564, -73.867135,
                  -73.872564, -73.867135, -73.872564, -73.867135,
                  -73.872564, -73.867135],
    'pressurei': [30.24, 30.24, 30.32, 30.32, 30.14,
                  30.14, 29.98, 29.98, 30.01, 30.01],
    'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'tempi': [52.0, 52.0, 48.9, 48.9, 54.0, 54.0, 57.2, 57.2, 48.9, 48.9],
    'wspdi': [8.1, 8.1, 6.9, 6.9, 3.5, 3.5, 15.0, 15.0, 15.0, 15.0]
})

# Inner join on the shared date/hour/location columns.
merged = ex_subway_df.merge(
    ex_weather_df,
    on=['DATEn', 'hour', 'latitude', 'longitude'],
    how='inner'
)

if __name__ == "__main__":
    import matplotlib.pyplot as plt

    # (`subway_df` is loaded earlier in this script.  The hourly data
    # is recorded in four-hour intervals.)
    print(subway_df.head())

    # Q1: mean ridership per 4-hour interval.  Selecting the column
    # before aggregating avoids averaging every (possibly non-numeric)
    # column of the frame.
    ridership_by_hour = subway_df.groupby('hour')['ENTRIESn_hourly'].mean()
    print(ridership_by_hour)

    # Highest ridership falls in the 12pm-4pm, 4pm-8pm and 8pm-12am
    # intervals.  (pylab's bare title()/xlabel() only exist after the
    # notebook magic; use the explicit pyplot calls.)
    ridership_by_hour.plot.bar()
    plt.title('4-Hour Intervals vs Subway Ridership')
    plt.ylabel('Number of Subway Riders')
    plt.xlabel('Time (4hr intervals)')

    # Per-station hourly entries and exits for the small example.
    print(ridership_by_station)
    ridership_by_station.plot.bar()
    plt.title('Stations vs Ridership')
    plt.xlabel('Station Number')
    plt.ylabel('Number of Subway Riders')

    # Joined subway + weather example.
    print(merged)

    # Visualizations on the full dataset:
    # a scatter of stations at (latitude, longitude) with bubble size
    # proportional to mean hourly entries.  as_index=False keeps the
    # latitude/longitude as ordinary columns; numeric_only=True keeps
    # mean() from failing on non-numeric columns (pandas >= 2.0).
    data_by_location = subway_df.groupby(
        ['latitude', 'longitude'], as_index=False
    ).mean(numeric_only=True)
    print(data_by_location.head()['latitude'])
    print(data_by_location.head()['longitude'])

    # Scale the bubble markers by the standard deviation.
    scaled_hourly_entries = (
        10 * data_by_location['ENTRIESn_hourly']
        / data_by_location['ENTRIESn_hourly'].std()
    )
    plt.scatter(data_by_location['latitude'],
                data_by_location['longitude'],
                s=scaled_hourly_entries)
    plt.title('Location and Ridership as Markers')
    plt.xlabel('Latitude')
    plt.ylabel('Longitude')
    # Comparing with an NYC subway map, the high-ridership cluster sits
    # around longitude -73.95..-74.03, latitude 40.70..40.79.