#!/usr/bin/env python # coding: utf-8 # `Indexes` can be a difficult concept to grasp at first. # I suspect this is partly becuase they're somewhat peculiar to pandas. # These aren't like the indexes put on relational database tables for performance optimizations. # Rather, they're more like the `row_labels` of an R DataFrame, but much more capable. # # `Indexes` offer # # - metadata container # - easy label-based row selection # - easy label-based alignment in operations # - label-based concatenation # # To demonstrate these, we'll first fetch some more data. # This will be weather data from sensors at a bunch of airports across the US. # See [here](https://github.com/akrherz/iem/blob/master/scripts/asos/iem_scraper_example.py) for the example scraper I based this off of. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import json import glob import datetime from io import StringIO import requests import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt sns.set_style('ticks') # States are broken into networks. The networks have a list of ids, each representing a station. # We will take that list of ids and pass them as query parameters to the URL we built up ealier. states = """AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY""".split() # IEM has Iowa AWOS sites in its own labeled network networks = ['AWOS'] + ['{}_ASOS'.format(state) for state in states] # In[2]: def get_weather(stations, start=pd.Timestamp('2014-01-01'), end=pd.Timestamp('2014-01-31')): ''' Fetch weather data from MESONet between ``start`` and ``stop``. ''' url = ("http://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?" "&data=tmpf&data=relh&data=sped&data=mslp&data=p01i&data=vsby&data=gust_mph&data=skyc1&data=skyc2&data=skyc3" "&tz=Etc/UTC&format=comma&latlon=no" "&{start:year1=%Y&month1=%m&day1=%d}" "&{end:year2=%Y&month2=%m&day2=%d}&{stations}") stations = "&".join("station=%s" % s for s in stations) weather = (pd.read_csv(url.format(start=start, end=end, stations=stations), comment="#") .rename(columns={"valid": "date"}) .rename(columns=str.strip) .assign(date=lambda df: pd.to_datetime(df['date'])) .set_index(["station", "date"]) .sort_index()) float_cols = ['tmpf', 'relh', 'sped', 'mslp', 'p01i', 'vsby', "gust_mph"] weather[float_cols] = weather[float_cols].apply(pd.to_numeric, errors="corce") return weather # In[3]: def get_ids(network): url = "http://mesonet.agron.iastate.edu/geojson/network.php?network={}" r = requests.get(url.format(network)) md = pd.io.json.json_normalize(r.json()['features']) md['network'] = network return md # Talk briefly about the gem of a method that is `json_normalize`. # In[7]: url = "http://mesonet.agron.iastate.edu/geojson/network.php?network={}" r = requests.get(url.format("AWOS")) js = r.json() # In[8]: js['features'][:2] # In[9]: pd.DataFrame(js['features']).head().to_html() # [{'state': fl, 'counties': [arr]}] # # [{'geometry': {'coordinates': [-94.2723694444, 43.0796472222], # In[49]: js['features'][0] # In[17]: js['features'] # In[37]: get_ipython().run_line_magic('pinfo', 'pd.io.json.json_normalize') # In[ ]: # In[10]: stations = pd.io.json.json_normalize(js['features']).id url = ("http://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?" "&data=tmpf&data=relh&data=sped&data=mslp&data=p01i&data=vsby&data=gust_mph&data=skyc1&data=skyc2&data=skyc3" "&tz=Etc/UTC&format=comma&latlon=no" "&{start:year1=%Y&month1=%m&day1=%d}" "&{end:year2=%Y&month2=%m&day2=%d}&{stations}") stations = "&".join("station=%s" % s for s in stations) start = pd.Timestamp('2014-01-01') end=pd.Timestamp('2014-01-31') weather = (pd.read_csv(url.format(start=start, end=end, stations=stations), comment="#")) # ```python # import os # ids = pd.concat([get_ids(network) for network in networks], ignore_index=True) # gr = ids.groupby('network') # # os.makedirs("weather", exist_ok=True) # # for i, (k, v) in enumerate(gr): # print("{}/{}".format(i, len(network)), end='\r') # weather = get_weather(v['id']) # weather.to_csv("weather/{}.csv".format(k)) # # weather = pd.concat([ # pd.read_csv(f, parse_dates='date', index_col=['station', 'date']) # for f in glob.glob('weather/*.csv')]) # # weather.to_hdf("weather.h5", "weather") # ``` # In[1]: weather = pd.read_hdf("weather.h5", "weather").sort_index() weather.head() # OK, that was a bit of work. Here's a plot to reward ourselves. # In[2]: airports = ['DSM', 'ORD', 'JFK', 'PDX'] g = sns.FacetGrid(weather.sort_index().loc[airports].reset_index(), col='station', hue='station', col_wrap=2, size=4) g.map(sns.regplot, 'sped', 'gust_mph') plt.savefig('../content/images/indexes_wind_gust_facet.png'); # In[8]: airports = ['DSM', 'ORD', 'JFK', 'PDX'] g = sns.FacetGrid(weather.sort_index().loc[airports].reset_index(), col='station', hue='station', col_wrap=2, size=4) g.map(sns.regplot, 'sped', 'gust_mph') plt.savefig('../content/images/indexes_wind_gust_facet.svg', transparent=True); # # Set Operations # Indexes are set-like (technically *multi*sets, since you can have duplicates), so they support most python `set` operations. Indexes are immutable so you won't find any of the inplace `set` operations. # One other difference is that since `Index`es are also array like, you can't use some infix operators like `-` for `difference`. If you have a numeric index it is unclear whether you intend to perform math operations or set operations. # You can use `&` for intersetion, `|` for union, and `^` for symmetric difference though, since there's no ambiguity. # # For example, lets find the set of airports that we have weather and flight information on. Since `weather` had a MultiIndex of `airport,datetime`, we'll use the `levels` attribute to get at the airport data, separate from the date data. # In[3]: # Bring in the flights data flights = pd.read_hdf('flights.h5', 'flights') weather_locs = weather.index.levels[0] # The `categories` attribute of a Categorical is an Index origin_locs = flights.origin.cat.categories dest_locs = flights.dest.cat.categories airports = weather_locs & origin_locs & dest_locs airports # In[8]: print("Weather, no flights:\n\t", weather_locs.difference(origin_locs | dest_locs), end='\n\n') print("Flights, no weather:\n\t", (origin_locs | dest_locs).difference(weather_locs), end='\n\n') print("Dropped Stations:\n\t", (origin_locs | dest_locs) ^ weather_locs) # # Flavors # # Pandas has many subclasses of the regular `Index`, each tailored to a specific kind of data. # Most of the time these will be created for you automatically, so you don't have to worry about which one to choose. # # 1. [`Index`](http://pandas.pydata.org/pandas-docs/version/0.18.0/generated/pandas.Index.html#pandas.Index) # 2. `Int64Index` # 3. `RangeIndex` (Memory-saving special case of `Int64Index`) # 4. `FloatIndex` # 5. `DatetimeIndex`: Datetime64[ns] precision data # 6. `PeriodIndex`: Regularly-spaced, arbitrary precision datetime data. # 7. `TimedeltaIndex`: Timedelta data # 8. `CategoricalIndex`: # # Some of these are purely optimizations, others use information about the data to provide additional methods. # And while sometimes you might work with indexes directly (like the set operations above), most of they time you'll be operating on a Series or DataFrame, which in turn makes use of its Index. # ### Row Slicing # We saw in part one that they're great for making *row* subsetting as easy as column subsetting. # In[9]: weather.loc['DSM'].head() # Without indexes we'd probably resort to boolean masks. # In[10]: weather2 = weather.reset_index() weather2[weather2['station'] == 'DSM'].head() # Slightly less convenient, but still doable. # ### Indexes for Easier Arithmetic, Analysis # It's nice to have your metadata (labels on each observation) next to you actual values. But if you store them in an array, they'll get in the way. Say we wanted to translate the farenheit temperature to celcius. # In[116]: # With indecies temp = weather['tmpf'] c = (temp - 32) * 5 / 9 c.to_frame() # In[13]: # without temp2 = weather.reset_index()[['station', 'date', 'tmpf']] temp2['tmpf'] = (temp2['tmpf'] - 32) * 5 / 9 temp2.head() # Again, not terrible, but not as good. # And, what if you had wanted to keep farenheit around as well, instead of overwriting it like we did? # Then you'd need to make a copy of everything, including the `station` and `date` columns. # We don't have that problem, since indexes are mutable and safely shared between DataFrames / Series. # In[14]: temp.index is c.index # ### Indexes for Alignment # # I've saved the best for last. # Automatic alignment, or reindexing, is fundamental to pandas. # # All binary operations (add, multiply, etc...) between Series/DataFrames first *align* and then proceed. # Let's suppose we have hourly observations on temperature and windspeed. # And suppose some of the observations were invalid, and not reported (simulated below by sampling from the full dataset). We'll assume the missing windspeed observations were potentially different from the missing temperature observations. # In[124]: dsm = weather.loc['DSM'] hourly = dsm.resample('H').mean() temp = hourly['tmpf'].sample(frac=.5, random_state=1).sort_index() sped = hourly['sped'].sample(frac=.5, random_state=2).sort_index() # In[125]: temp.head().to_frame() # In[126]: sped.head() # Notice that the two indexes aren't identical. # # Suppose that the `windspeed : temperature` ratio is meaningful. # When we go to compute that, pandas will automatically align the two by index label. # In[129]: sped / temp # This lets you focus on doing the operation, rather than manually aligning things, ensuring that the arrays are the same length and in the same order. # By deault, missing values are inserted where the two don't align. # You can use the method version of any binary operation to specify a `fill_value` # In[130]: sped.div(temp, fill_value=1) # And since I couldn't find anywhere else to put it, you can control the axis the operation is aligned along as well. # In[131]: hourly.div(sped, axis='index') # The non row-labeled version of this is messy. # In[132]: temp2 = temp.reset_index() sped2 = sped.reset_index() # Find rows where the operation is defined common_dates = pd.Index(temp2.date) & sped2.date pd.concat([ # concat to not lose date information sped2.loc[sped2['date'].isin(common_dates), 'date'], (sped2.loc[sped2.date.isin(common_dates), 'sped'] / temp2.loc[temp2.date.isin(common_dates), 'tmpf'])], axis=1).dropna(how='all') # Yeah, I prefer the `temp / sped` version. # # Alignment isn't limited to arithmetic operations, although those are the most obvious and easiest to demonstrate. # # Merging # # There are two ways of merging DataFrames / Series in pandas # # 1. Relational Database style with `pd.merge` # 2. Array style with `pd.concat` # # Personally, I think in terms of the `concat` style. # I learned pandas before I ever really used SQL, so it comes more naturally to me I suppose. # `pd.merge` has more flexibilty, though I think *most* of the time you don't need this flexibilty. # ### Concat Version # In[133]: pd.concat([temp, sped], axis=1).head() # The `axis` parameter controls how the data should be stacked, `0` for vertically, `1` for horizontally. # The `join` parameter controls the merge behavior on the shared axis, (the Index for `axis=1`). By default it's like a union of the two indexes, or an outer join. # In[134]: pd.concat([temp, sped], axis=1, join='inner') # ### Merge Version # # Since we're joining by index here the merge version is quite similar. # We'll see an example later of a one-to-many join where the two differ. # In[41]: pd.merge(temp.to_frame(), sped.to_frame(), left_index=True, right_index=True).head() # In[42]: pd.merge(temp.to_frame(), sped.to_frame(), left_index=True, right_index=True, how='outer').head() # Like I said, I typically prefer `concat` to `merge`. # The exception here is one-to-many type joins. Let's walk through one of those, # where we join the flight data to the weather data. # To focus just on the merge, we'll aggregate hour weather data to be daily, rather than trying to find the closest recorded weather observation to each departure (you could do that, but it's not the focus right now). We'll then join the one `(airport, date)` record to the many `(airport, date, flight)` records. # Quick tangent, to get the weather data to daily frequency, we'll need to resample (more on that in the timeseries section). The resample essentially involves breaking the recorded values into daily buckets and computing the aggregation function on each bucket. The only wrinkle is that we have to resample *by station*, so we'll use the `pd.TimeGrouper` helper. # In[43]: idx_cols = ['unique_carrier', 'origin', 'dest', 'tail_num', 'fl_num', 'fl_date'] data_cols = ['crs_dep_time', 'dep_delay', 'crs_arr_time', 'arr_delay', 'taxi_out', 'taxi_in', 'wheels_off', 'wheels_on', 'distance'] df = flights.set_index(idx_cols)[data_cols].sort_index() # In[64]: def mode(x): ''' Arbitrarily break ties. ''' return x.value_counts().index[0] aggfuncs = {'tmpf': 'mean', 'relh': 'mean', 'sped': 'mean', 'mslp': 'mean', 'p01i': 'mean', 'vsby': 'mean', 'gust_mph': 'mean', 'skyc1': mode, 'skyc2': mode, 'skyc3': mode} # TimeGrouper works on a DatetimeIndex, so we move `station` to the # columns and then groupby it as well. daily = (weather.reset_index(level="station") .groupby([pd.TimeGrouper('1d'), "station"]) .agg(aggfuncs)) daily.head() # ### The merge version # In[110]: m = pd.merge(flights, daily.reset_index().rename(columns={'date': 'fl_date', 'station': 'origin'}), on=['fl_date', 'origin']).set_index(idx_cols).sort_index() m.head() # In[135]: m.sample(n=10000).pipe((sns.jointplot, 'data'), 'sped', 'dep_delay') plt.savefig('../content/images/indexes_sped_delay_join.svg', transparent=True) # In[136]: m.groupby('skyc1').dep_delay.agg(['mean', 'count']).sort_values(by='mean') # In[137]: import statsmodels.api as sm # In[203]: mod = sm.OLS.from_formula('dep_delay ~ C(skyc1) + distance + tmpf + relh + sped + mslp', data=m) res = mod.fit() res.summary() # In[206]: fig, ax = plt.subplots() ax.scatter(res.fittedvalues, res.resid, color='k', marker='.', alpha=.25) ax.set(xlabel='Predicted', ylabel='Residual') sns.despine() plt.savefig('../content/images/indexes_resid_fit.png', transparent=True) # In[4]: weather.head() # In[77]: import numpy as np import pandas as pd def read(fp): df = (pd.read_csv(fp) .rename(columns=str.lower) .drop('unnamed: 36', axis=1) .pipe(extract_city_name) .pipe(time_to_datetime, ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time']) .assign(fl_date=lambda x: pd.to_datetime(x['fl_date']), dest=lambda x: pd.Categorical(x['dest']), origin=lambda x: pd.Categorical(x['origin']), tail_num=lambda x: pd.Categorical(x['tail_num']), unique_carrier=lambda x: pd.Categorical(x['unique_carrier']), cancellation_code=lambda x: pd.Categorical(x['cancellation_code']))) return df def extract_city_name(df): ''' Chicago, IL -> Chicago for origin_city_name and dest_city_name ''' cols = ['origin_city_name', 'dest_city_name'] city = df[cols].apply(lambda x: x.str.extract("(.*), \w{2}", expand=False)) df = df.copy() df[['origin_city_name', 'dest_city_name']] = city return df def time_to_datetime(df, columns): ''' Combine all time items into datetimes. 2014-01-01,0914 -> 2014-01-01 09:14:00 ''' df = df.copy() def converter(col): timepart = (col.astype(str) .str.replace('\.0$', '') # NaNs force float dtype .str.pad(4, fillchar='0')) return pd.to_datetime(df['fl_date'] + ' ' + timepart.str.slice(0, 2) + ':' + timepart.str.slice(2, 4), errors='coerce') return datetime_part df[columns] = df[columns].apply(converter) return df flights = read("878167309_T_ONTIME.csv") # In[80]: locs = weather.index.levels[0] & flights.origin.unique() # In[76]: (weather.reset_index(level='station') .query('station in @locs') .groupby(['station', pd.TimeGrouper('H')])).mean() # In[82]: df = (flights.copy()[['unique_carrier', 'tail_num', 'origin', 'dep_time']] .query('origin in @locs')) # In[ ]: # In[55]: weather.loc['DSM'] # In[49]: df = df # In[47]: dep.head() # In[39]: flights.dep_time # In[25]: flights.dep_time.unique() # In[ ]: stations # In[9]: flights.dep_time.head() # In[ ]: