#!/usr/bin/env python
# coding: utf-8

# In[2]:

from IPython.display import Image
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# # Data Science with Python
#
# Andrew Bolster
# * [bolster.online](https://bolster.online)
# * Tweets [@bolster](https://twitter.bolster.online)
# * GitHubs at [andrewbolster](https://github.com/andrewbolster)
# * Works at [Sensum Co](https://sensum.co) **We're hiring DS/DevOps**
# * Plays at [Farset Labs](https://www.farsetlabs.org.uk)
# * **[THIS NOTEBOOK IS AT present.bolster.online](http://present.bolster.online)**
# * [Also available in source at presentgh.bolster.online](http://presentgh.bolster.online)

# # Zee Plan
#
# * A bit about me
# * A bit about the local open data and data science ecosystem
# * What I think you know
# * Repeat last week's tasks in a fraction of the time
# * Throw you in the deep end with a data cleaning exercise

# # About me
#
# * MEng Electronics & Software Engineering (QUB)
# * PhD Autonomous Systems (DSTL/MoD/NATO/UoL)
# * Charity Founder / Director (Farset Labs)
# * Open Data Activist (DF Open Data Advisory Panellist)
# * STEM Educator (CoderDojo, CodeCoop, STEMNET)
# * Data Scientist (Sensum/Free time)

# # Data Science in Northern Ireland
#
# Data Science: "Turning multi-modal data into actionable insights"
#
# AKA: "Turning numbers into other numbers, and occasionally graphs"

# ![](dozens.gif)

# ## Vibrant Corporate Ecosystem
#
# * [Analytics Engines](http://www.analyticsengines.com/)
# * [AquaQ Analytics](https://www.aquaq.co.uk/)
# * [Kainos](https://www.kainos.com/)
# * [Flexera|BDNA](https://www.bdna.com/category/flexera/)
# * [First Derivatives](https://www.firstderivatives.com/)
# * [Neueda](http://www.neueda.com/)
# * [Sensum](https://sensum.co/)
# * [BrainwaveBank](https://www.brainwavebank.com/)
# * Many, many, _many_ more

# ## Open Source / Meetup Ecosystem too
# ***Hint: If you really want to learn data science, go to/get involved in some of these***
#
# * [IoT Belfast](https://www.meetup.com/IOTBelfast/) - _TOMORROW_
# * [IoT Alliance](https://www.meetup.com/Belfast-New-Technology-Meetup/)
# * [PyBelfast](https://www.meetup.com/PyBelfast/)
# * [Code Co-op NI](https://www.meetup.com/CodeCoop-NI/)
# * [Data Art Belfast](https://www.meetup.com/Data-Art-Belfast/)
# * [Big Data Belfast Breakout](https://www.meetup.com/Big-Data-Belfast-Breakout/)
# * [Women Who Code Belfast](https://www.meetup.com/Women-Who-Code-Belfast/)
# * [Docker Belfast](https://www.meetup.com/Docker-Belfast/)
# * [DevOps Belfast](https://www.meetup.com/DevOps-Belfast/)
# * [DevSecOps Belfast](https://www.meetup.com/devsecops-belfast/)
#
# ***Hint the Second: These are the _best_ ways to get a job in the field***

# ## Engaged local Government/ComVol support
#
# * [OpenData NI](https://www.opendatani.gov.uk/)
# * [DetailData](http://data.nicva.org/)

# ## Other People Actually Know about what we're doing in NI
#
# * [Big Data Belfast Conference](http://www.bigdatabelfast.com/)
# * [ODCamp](http://odcamp.co.uk/)

# ![](careers_1.jpg)

# ![](fhrs.jpg)

# # Enough Waffle, show me the numbers!

# ## What I think you know
#
# * CSV / JSON / XML file structures
# * Python Standard Library
# * Open Data NI
# * Stack Overflow

# ## Setup/Requirements
# ```bash
# conda install pandas numpy geopandas shapely
# pip install ckanapi
# ```
#
# I would advise you to grab this notebook from here instead of frantically trying to keep up...
# * **[present.bolster.online](http://present.bolster.online)**
# * [Also available in source at presentgh.bolster.online](http://presentgh.bolster.online)
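# A quick sanity-check cell before we start (a sketch; feel free to skip):
# if any of these print MISSING, revisit the install commands above.

# In[ ]:

# Try importing each of the workshop requirements listed above
for module in ('pandas', 'numpy', 'geopandas', 'shapely', 'ckanapi'):
    try:
        __import__(module)
        print(module, 'OK')
    except ImportError as err:
        print(module, 'MISSING:', err)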
# In[3]:

from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
graph_figsize = (10, 6)  # I'm forgetful and lazy
get_ipython().run_line_magic('matplotlib', 'nbagg')  # Jupyter Magics!

from ckanapi import RemoteCKAN  # Access to ODNI
import pandas as pd             # Pandas shorthand
import numpy as np              # Maths
import os                       # System operations

# # Python Reminders

# ## Variables and Math

# In[4]:

variable = 5
print(variable)

# Basic Math Operations

# In[5]:

print(variable * 5)

# ## Tricky "Math" Operations

# In[6]:

output = variable * "na" + ", batman"  # Can you guess what it is yet?

# In[7]:

print(output)

# ## Loops and Lists

# Loops are good.

# In[8]:

for i in range(8):
    for j in range(2):
        print('na,', end='')
    print()
print('batman!')

# Lists are things

# In[9]:

variable = ['thing 1', 'thing 2']
print(variable)

# Lists are things that you can loop on

# In[10]:

variable = ['thing 1', 'thing 2']
for thing in variable:
    print("This is " + thing)

# ## Loopy Lists

# You can combine lists and loops in "List Comprehensions"

# In[11]:

print(['This is ' + thing for thing in variable])

# In[12]:

my_powers = [2**i for i in range(6)]
print(my_powers)

# ## Files

# You can do fancy things with file contexts

# In[13]:

with open('writer.txt', 'w') as file:
    file.write('Hi there from python')

# In[14]:

with open('writer.txt', 'r') as same_file:
    print(same_file.readlines())

# # Pandas
#
# * It's MASSIVE
# * It's PAWRFUL
# * It's sometimes fiddly.
#
# But it's awesome, so let's just plod on.
#
# I will be moving fast, so keep up and ask questions!

# In[15]:

# DataFrame creation example: dict of lists, column-wise
d = {'col1': [1, 2, 3, 4], 'col2': [5, 6, 7, 8]}
df = pd.DataFrame(data=d)
df

# In[16]:

# List of tuples; each tuple becomes a row
d = [(1, 2, 3, 4), (5, 6, 7, 8)]
df = pd.DataFrame(data=d)
df

# In[17]:

# Dict of lists; row-wise
d = {'row1': [1, 2, 3, 4], 'row2': [5, 6, 7, 8]}
df = pd.DataFrame.from_dict(d, orient='index')
df

# In[18]:

# Add a column
df[4] = [10, 10]
df

# In[19]:

# Add a row
df.loc['row0'] = [10, 10, 10, 10, 0]
df

# In[20]:

# Pandas will broadcast a single value across the whole row/column
df['somevalue'] = 1
df

# In[21]:

# Transpose
df.T

# In[22]:

# Drop a row
df = df.drop('row2')
df

# In[23]:

# Drop a column
df = df.drop('somevalue', axis=1)
df

# In[24]:

# Rename columns
df.rename(columns={0: 'zero'})

# In[25]:

# Note: many pandas operations return a new, modified copy;
# the rename above didn't actually change df
df

# In[26]:

# Rename in place
df.rename(columns={0: 'zero'}, inplace=True)
df

# In[27]:

# Create new, useful columns
df['subtotal'] = df.sum(axis=1)
df

# In[28]:

# Make meaningless graphs!
df.plot()
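# One more basics cell before we move on: pandas also round-trips the file
# formats you already know. A minimal sketch (the `toy.csv` filename is
# arbitrary, not part of the workshop material):

# In[ ]:

# Write the toy frame out as CSV, then read it straight back in
df.to_csv('toy.csv')
pd.read_csv('toy.csv', index_col=0)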
# # And now for something familiar in an unfamiliar way

# ## [The Food Premise Hygiene Ratings](https://www.opendatani.gov.uk/dataset/food-premise-hygiene-ratings)
#
# You lot should be experts at this, so feel free to correct me when it gets boring!

# In[29]:

# This is some magic that you don't need to worry about, but ask me about it at the end
def dataset_generator(resource_id):
    """A generator that yields records from a given dataset resource id"""
    ua = 'ItsBolster/29.5 (+http://farsetlabs.org.uk/)'
    demo = RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)
    offset = 0
    while True:
        datastore_page = demo.action.datastore_search(resource_id=resource_id,
                                                      offset=offset)
        if not datastore_page['records']:
            return  # PEP 479: return (don't raise StopIteration) to end a generator
        for record in datastore_page['records']:
            yield record  # Execution is passed back to the caller here
        offset += len(datastore_page['records'])  # Advance past the page we just consumed

# In[30]:

# Why Pandas is Awesome
df = pd.DataFrame.from_records(dataset_generator('3d998bd3-ecbe-4087-a653-ea11448ea53f'))
df.head()

# ## Tasks
# 1. Print a list containing all field names in the header
# 2. Print a list of all postcodes
# 3. Print a list of all establishment names that do not have a recorded postcode.
# 4. Print a list of all establishment names that are missing any item of information

# ### Print a list containing all field names in the header

# In[31]:

list(df.keys())

# ### Print a list of all postcodes

# In[32]:

postcodes = sorted(df['postcode'].unique())
print(postcodes)

# ***Bonus Round***: top postcodes, alphabetically

# In[33]:

df.groupby('postcode').size().sort_values(ascending=False).head(6)

# **Bonus Bonus Round**: there are many ways to do *similar* things.
# Top postcodes, by order of first appearance in the dataset

# In[34]:

df['postcode'].value_counts(ascending=False)[:6]

# ### Print a list of all establishment names that do not have a recorded postcode.

# In[35]:

df[df['postcode'] == '']['establishmentname']

# ### Print a list of all establishment names that are missing any item of information
#
# Assumptions:
# * "Any" information includes optional address fields

# In[36]:

df[df.isin((None, '')).any(axis=1)]['establishmentname']

# Well that was a bit pointless; everything is missing something! Our assumption was wrong.
#
# New assumption:
# * All entries should have a rating, an addressline3 and a postcode

# In[37]:

df[df[['postcode', 'rating', 'establishmentaddressline3']].isin((None, '')).any(axis=1)]['establishmentname']

# ## Tasks Continued
# That's all very well and good, but for some of the later challenges we need to fiddle with things.
#
# Data Science is all about tidying stuff up.

# In[38]:

df.head()

# In[39]:

# Looking good, but that _id is a bit pointless
df = df.drop('_id', axis=1)
df.head()

# ## Do some actual analysis
# That's more like it.
#
# Start with the average rating

# In[40]:

try:
    df['rating'].mean()
except TypeError as e:
    print("That didn't work", e)

# In[41]:

# DANGER WILL ROBINSON
df.dtypes

# ### So what seems to be the trouble here?

# In[42]:

df['rating'].unique()

# There are A LOT of un-rated premises...
# So we shouldn't just throw them away.

# In[43]:

# Count the un-rated rows (.size on a filtered frame would count every cell, not rows)
df['rating'].isnull().sum()

# ### Cast as float; Nones become NaN
# (None is the absence of a value; NaN is "Not a Number")
# If you really care, look up category theory.
# But I don't, so let's jog on.

# In[44]:

df['rating'] = df['rating'].astype(float)
df['rating'].unique()

# Now, what's the average rating?

# In[45]:

df['rating'].mean()

# This [by default](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.mean.html) throws away `nan` values, because performing maths on 'nothing' leads to infinities, which is **A Bad Thing™**

# In[46]:

df['rating'].mean(skipna=False)
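# As an aside, a minimal sketch of where this can go next (assuming df as
# loaded above, and that postcodes look like 'BT1 1AA'): average rating by
# postcode district, i.e. the part before the space.

# In[ ]:

district = df['postcode'].str.split(' ').str[0]  # 'BT1 1AA' -> 'BT1'; blank postcodes stay ''
df.groupby(district)['rating'].mean().sort_values(ascending=False).head()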
# ### So what's the distribution of ratings?
# Who knows about Histograms?

# In[47]:

f, ax = plt.subplots()
df['rating'].plot.hist(ax=ax)

# ### Getting the `size` of each rating group:

# In[48]:

df.groupby('rating').size()

# ### Having your Pie and eating it

# In[49]:

f, ax = plt.subplots()
df.groupby('rating').size().plot.pie(ax=ax, label='Ratings')

# ## Data Cleaning
#
# * What's necessary and what's not?
# * What's broken/wrong?

# In[50]:

df.head()

# In[51]:

# BORING
df.drop('buildingid', axis=1, inplace=True)

# In[52]:

# REDUNDANT: every column name repeats 'establishment'
for c in df.keys():
    print(c, c.replace('establishment', ''))

# In[53]:

replacements = {c: c.replace('establishment', '') for c in df.keys()}
replacements

# In[54]:

df.rename(columns=replacements, inplace=True)
df.head()

# In[55]:

# Something doesn't look right...
df.dtypes

# In[56]:

df['inspectiondate'] = pd.to_datetime(df['inspectiondate'])
df.head()

# ### Calculate days since last inspection

# In[57]:

# Subtract from midnight today; a date object on the left is fragile across pandas versions
df['dayssinceinspection'] = pd.Timestamp.now().normalize() - df['inspectiondate']
df[['name', 'inspectiondate', 'dayssinceinspection']].head()

# In[58]:

df.dayssinceinspection.describe()

# In[59]:

df.sort_values(by='dayssinceinspection', ascending=False).head()

# In[60]:

df.sort_values(by='dayssinceinspection', ascending=False).iloc[0]

# In[61]:

f, ax = plt.subplots(figsize=graph_figsize)
df.dayssinceinspection.dt.days.plot.hist(ax=ax)

# In[62]:

f, ax = plt.subplots(figsize=graph_figsize)
df.dayssinceinspection.dt.days.plot.hist(ax=ax)
first_inspection = df.sort_values(by='dayssinceinspection', ascending=False).iloc[0]
ax.vlines(first_inspection.dayssinceinspection.days, *ax.get_ylim())

# In[63]:

# Data Cleaning: duplicated-looking names are a hint of messiness
df.groupby('name').size().sort_values(ascending=False).head()

# ## Extra Time? Location Mapping
# That's some awful strange looking latitude / longitude

# In[64]:

df.head()

# # The Irish Grid

# ![](grid.jpg)

# ### Grid Example
# [https://irish.gridreferencefinder.com/](https://irish.gridreferencefinder.com/)

# ![](map.png)

# In[65]:

from pyproj import Proj, transform

prj_wgs = Proj(proj='latlong', datum='WGS84')
prj_itm = Proj(init='EPSG:29903')  # TM75 / Irish Grid

def reproject_itm_to_wgs(eastings, northings):
    long, lat = transform(prj_itm, prj_wgs, eastings, northings)
    return long, lat

reproject_itm_to_wgs(334581, 375525)

# In[66]:

df.rename(columns={'longitude': 'eastings', 'latitude': 'northings'}, inplace=True)
df.head()

# In[67]:

def apply_itm_to_wgs(row):
    eastings, northings = row['eastings'], row['northings']
    if pd.notnull(eastings) and pd.notnull(northings):  # catches both None and NaN
        long, lat = transform(prj_itm, prj_wgs, eastings, northings)
    else:
        long, lat = None, None
    return pd.Series({'latitude': lat, 'longitude': long})

lats_n_longs = df[['eastings', 'northings']].apply(apply_itm_to_wgs, axis=1)
lats_n_longs.head(10)

# In[68]:

try:
    df = df.join(lats_n_longs)
except ValueError:  # Gets thrown if I accidentally run this twice
    pass
df.head()
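# A quick sanity check on the reprojection (a sketch; the bounds are rough
# figures for Northern Ireland, roughly 54-55.5°N and 8.2-5.4°W):

# In[ ]:

# Rows with NaN coordinates fail .between(), so they count as implausible
in_ni = (df['latitude'].between(54.0, 55.5) &
         df['longitude'].between(-8.2, -5.4))
print("Rows with plausible NI coordinates:", in_ni.sum(), "of", len(df))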
# ## Maps!

# In[69]:

import geopandas as gp
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
_df = df.drop(['longitude', 'latitude'], axis=1)
crs = {'init': 'epsg:4326'}  # WGS84 lat/long
gdf = gp.GeoDataFrame(_df, crs=crs, geometry=geometry)
gdf.plot(figsize=graph_figsize)

# ## Get Open Data Shape File to give a meaningful map of NI
# [Search for "Shapefile" on OpendataNI](https://www.opendatani.gov.uk/dataset/osni-open-data-largescale-boundaries-ni-outline1/resource/7e384dd8-3a59-4ae9-be59-386d80fa4db3) (Also in this repo)

# In[70]:

base_map = gp.GeoDataFrame.from_file("OSNI_Open_Data_Largescale_Boundaries__NI_Outline.shp")
base_map.plot(figsize=graph_figsize)

# In[71]:

f, ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating', ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')

# In[72]:

min_max = df[['latitude', 'longitude']].agg(['min', 'max'])
min_max

# In[73]:

f, ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating', ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
ax.set_xlim(min_max['longitude'].values)
ax.set_ylim(min_max['latitude'].values)

# In[74]:

f, ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating', ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
ax.set_xlim(min_max['longitude'].values)
ax.set_ylim(min_max['latitude'].values)
ax.set_aspect('equal')

# In[75]:

percentiles = df[['latitude', 'longitude']].quantile([0.01, 0.99])
percentiles

# In[76]:

f, ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating', ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
ax.set_xlim(percentiles['longitude'].values)
ax.set_ylim(percentiles['latitude'].values)
ax.set_aspect('equal')

# # Challenges / Remainder of Workshop / "Homework"
# * Find out what the outlier location was, using whatever method you like (a starter sketch follows below)
# * Look through [OpenDataNI](https://www.opendatani.gov.uk/dataset) Datasets
#   * Pick one to analyse
#   * (If it's crap, pick another one)
#   * Make one graph that combines two measurements (i.e. position + rating as before, or rating + last-inspection)
# * Check out the [dataset suggestions currently open](https://www.opendatani.gov.uk/datarequest?state=open)
#   * Find one you would be interested in analysing, and comment on what you think you could do with it or how you would combine it with other datasets
#
# **Bonus Round**
# If there's something you think government *has* that you *want* and haven't *seen*, request it.
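# A starter for the outlier challenge: a minimal sketch (one approach among
# many) that lists the rows falling outside the 1st-99th percentile window
# we computed above.

# In[ ]:

# Anything outside both percentile bands is a candidate outlier
outliers = df[~(df['latitude'].between(*percentiles['latitude']) &
                df['longitude'].between(*percentiles['longitude']))]
outliers[['name', 'latitude', 'longitude']].dropna()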