%%HTML
<style>
/* style for presentation only */
.reveal .rendered_html table { font-size: 24px }
</style>
from IPython.display import Image
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Andrew Bolster
Data Science: "Turning Multi-modal data into actionable insights"
AKA: "Turning Numbers into Other Numbers, and occasionally graphs"
*Hint: If you really want to learn data science, go to/get involved in some of these*
*Hint the Second: These are the best ways to get a job in the field*
conda install pandas numpy geopandas shapely
pip install ckanapi
I would advise you grab this notebook from here instead of frantically trying to keep up...
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
graph_figsize = (10,6) # I'm forgetful and lazy
%matplotlib nbagg
# Jupyter Magics!
from ckanapi import RemoteCKAN # Access to ODNI
import pandas as pd # Pandas shorthand
import numpy as np # Maths
import os # System Operations
variable = 5
print(variable)
5
Basic Math Operations
print(variable * 5)
25
output = variable * "na" + ", batman"
# Can guess what it is yet?
print(output)
nanananana, batman
Loops are good.
for i in range(8):
for j in range(2):
print('na,', end='')
print()
print('batman!')
na,na, na,na, na,na, na,na, na,na, na,na, na,na, na,na, batman!
Lists are things
variable = ['thing 1','thing 2']
print(variable)
['thing 1', 'thing 2']
Lists are things that you can loop on
variable = ['thing 1','thing 2']
for thing in variable:
print("This is "+thing)
This is thing 1 This is thing 2
You can combine lists and loops in "List Comprehensions"
print(['This is '+thing for thing in variable])
['This is thing 1', 'This is thing 2']
my_powers = [2**i for i in range(6)]
print(my_powers)
[1, 2, 4, 8, 16, 32]
You can do fancy things with file contexts
with open('writer.txt', 'w') as file:
file.write('Hi there from python')
with open('writer.txt', 'r') as same_file:
print(same_file.readlines())
['Hi there from python']
But it's awesome so lets just plod on.
I will be moving fast so keep up and ask questions!
# DataFrame Creation Example- Dict of Lists, colwise
d = {'col1': [1,2,3,4], 'col2': [5,6,7,8]}
df = pd.DataFrame(data=d)
df
col1 | col2 | |
---|---|---|
0 | 1 | 5 |
1 | 2 | 6 |
2 | 3 | 7 |
3 | 4 | 8 |
# List of Tuples
d = [(1, 2 ,3 ,4), (5, 6, 7, 8)]
df = pd.DataFrame(data=d)
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 1 | 2 | 3 | 4 |
1 | 5 | 6 | 7 | 8 |
# Dict of Lists; rowwise
d = {'row1': [1,2,3,4], 'row2': [5,6,7,8]}
df = pd.DataFrame.from_dict(d, orient='index')
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 |
row2 | 5 | 6 | 7 | 8 |
# Add Column
df[4] = [10,10]
df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 |
row2 | 5 | 6 | 7 | 8 | 10 |
# Add Row
df.loc['row0'] = [10,10,10,10,0]
df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 |
row2 | 5 | 6 | 7 | 8 | 10 |
row0 | 10 | 10 | 10 | 10 | 0 |
# Pandas will fill the row/col if you just give it one value
df['somevalue'] = 1
df
0 | 1 | 2 | 3 | 4 | somevalue | |
---|---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 | 1 |
row2 | 5 | 6 | 7 | 8 | 10 | 1 |
row0 | 10 | 10 | 10 | 10 | 0 | 1 |
# Transpose
df.T
row1 | row2 | row0 | |
---|---|---|---|
0 | 1 | 5 | 10 |
1 | 2 | 6 | 10 |
2 | 3 | 7 | 10 |
3 | 4 | 8 | 10 |
4 | 10 | 10 | 0 |
somevalue | 1 | 1 | 1 |
# Drop Row
df = df.drop('row2')
df
0 | 1 | 2 | 3 | 4 | somevalue | |
---|---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 | 1 |
row0 | 10 | 10 | 10 | 10 | 0 | 1 |
# Drop Column
df = df.drop('somevalue', axis=1)
df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 |
row0 | 10 | 10 | 10 | 10 | 0 |
# Rename columns
df.rename(columns={0:'zero'})
zero | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 |
row0 | 10 | 10 | 10 | 10 | 0 |
#Note: Many pandas operations only return a 'view', doesn't change
df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 |
row0 | 10 | 10 | 10 | 10 | 0 |
# Rename in place
df.rename(columns={0:'zero'}, inplace=True)
df
zero | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 |
row0 | 10 | 10 | 10 | 10 | 0 |
# Create new, useful, columns
df['subtotal'] = df.sum(axis=1)
df
zero | 1 | 2 | 3 | 4 | subtotal | |
---|---|---|---|---|---|---|
row1 | 1 | 2 | 3 | 4 | 10 | 20 |
row0 | 10 | 10 | 10 | 10 | 0 | 40 |
# Make meaningless graphs!
df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10bd05128>
You lot should be experts at this, so feel free to correct me when it gets boring!
# This is some magic that you don't need to worry about, but ask me about it at the end
def dataset_generator(resource_id):
"""A Generator that yields records from a given dataset resource id"""
ua = 'ItsBolster/29.5 (+http://farsetlabs.org.uk/)'
demo = RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)
offset=0
while True:
datastore_page = demo.action.datastore_search(resource_id=resource_id, offset=offset)
if not datastore_page['records']:
raise StopIteration
for record in datastore_page['records']:
yield record ## Execution is passed back to the caller here
offset+=1
# Why Pandas is Awesome
df = pd.DataFrame.from_records(dataset_generator('3d998bd3-ecbe-4087-a653-ea11448ea53f'))
df.head()
_id | buildingid | establishmentaddressline1 | establishmentaddressline2 | establishmentaddressline3 | establishmentaddressline4 | establishmentname | inspectiondate | latitude | longitude | postcode | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 185003495 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23T00:00:00 | 375525 | 334581 | BT3 9AH | 5 | ||
1 | 2 | 185746347 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21T00:00:00 | 378213 | 332938 | BT15 5HD | 5 | |
2 | 3 | 185001807 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19T00:00:00 | 373992 | 334055 | BT1 4NX | 4 | ||
3 | 4 | 185000821 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03T00:00:00 | 374852 | 333733 | BT1 2GX | 5 | ||
4 | 5 | 187129705 | 1 Union Street | Maverick | 2014-02-13T00:00:00 | 378435 | 281262 | BT1 2JF | 4 |
recorded postcode. 4. Print a list of all establishment names that are missing any item of information
list(df.keys())
['_id', 'buildingid', 'establishmentaddressline1', 'establishmentaddressline2', 'establishmentaddressline3', 'establishmentaddressline4', 'establishmentname', 'inspectiondate', 'latitude', 'longitude', 'postcode', 'rating']
postcodes = sorted(df['postcode'].unique())
print(postcodes)
['', 'BT1 1AA', 'BT1 1AL', 'BT1 1BL', 'BT1 1DA', 'BT1 1DD', 'BT1 1DJ', 'BT1 1DL', 'BT1 1DN', 'BT1 1EA', 'BT1 1FB', 'BT1 1FE', 'BT1 1FF', 'BT1 1FG', 'BT1 1FJ', 'BT1 1FP', 'BT1 1FY', 'BT1 1GA', 'BT1 1GB', 'BT1 1GH', 'BT1 1GJ', 'BT1 1HB', 'BT1 1HD', 'BT1 1HE', 'BT1 1HL', 'BT1 1HS', 'BT1 1HU', 'BT1 1HY', 'BT1 1JE', 'BT1 1JL', 'BT1 1JN', 'BT1 1JQ', 'BT1 1LA', 'BT1 1LE', 'BT1 1LS', 'BT1 1LT', 'BT1 1LU', 'BT1 1NA', 'BT1 1NB', 'BT1 1ND', 'BT1 1PE', 'BT1 1PG', 'BT1 1QA', 'BT1 1QB', 'BT1 1QN', 'BT1 1RB', 'BT1 1RD', 'BT1 2AA', 'BT1 2AB', 'BT1 2BD', 'BT1 2BE', 'BT1 2DX', 'BT1 2DY', 'BT1 2DZ', 'BT1 2ED', 'BT1 2FF', 'BT1 2FH', 'BT1 2FJ', 'BT1 2GP', 'BT1 2GT', 'BT1 2GW', 'BT1 2GX', 'BT1 2GY', 'BT1 2JD', 'BT1 2JF', 'BT1 2JG', 'BT1 2JH', 'BT1 2LA', 'BT1 2LB', 'BT1 2LD', 'BT1 2LH', 'BT1 2LQ', 'BT1 2LR', 'BT1 2LZ', 'BT1 2NB', 'BT1 2NJ', 'BT1 2NQ', 'BT1 3AA', 'BT1 3AF', 'BT1 3AH', 'BT1 3AJ', 'BT1 3AL', 'BT1 3BG', 'BT1 3BH', 'BT1 3BS', 'BT1 3BT', 'BT1 3BU', 'BT1 3EA', 'BT1 3EB', 'BT1 3EQ', 'BT1 3ES', 'BT1 3FG', 'BT1 3FJ', 'BT1 3GE', 'BT1 3GH', 'BT1 3GL', 'BT1 3GN', 'BT1 3HH', 'BT1 3JA', 'BT1 3JL', 'BT1 3JQ', 'BT1 3LA', 'BT1 3LG', 'BT1 3LL', 'BT1 3LP', 'BT1 3NL', 'BT1 3NQ', 'BT1 3NR', 'BT1 3PH', 'BT1 3WH', 'BT1 4DA', 'BT1 4DB', 'BT1 4DQ', 'BT1 4DR', 'BT1 4DT', 'BT1 4EA', 'BT1 4EB', 'BT1 4EF', 'BT1 4EG', 'BT1 4EH', 'BT1 4EJ', 'BT1 4FF', 'BT1 4FT', 'BT1 4GA', 'BT1 4GB', 'BT1 4GG', 'BT1 4GH', 'BT1 4GJ', 'BT1 4GQ', 'BT1 4HW', 'BT1 4JA', 'BT1 4JB', 'BT1 4JE', 'BT1 4LA', 'BT1 4LS', 'BT1 4NN', 'BT1 4NX', 'BT1 4PE', 'BT1 4QG', 'BT1 4QL', 'BT1 4QN', 'BT1 5AA', 'BT1 5AB', 'BT1 5AJ', 'BT1 5AW', 'BT1 5BA', 'BT1 5BB', 'BT1 5BN', 'BT1 5BY', 'BT1 5DA', 'BT1 5DB', 'BT1 5DG', 'BT1 5DP', 'BT1 5EA', 'BT1 5EB', 'BT1 5ED', 'BT1 5EE', 'BT1 5FE', 'BT1 5FF', 'BT1 5GB', 'BT1 5GS', 'BT1 5HB', 'BT1 5HE', 'BT1 5JA', 'BT1 5JD', 'BT1 5UB', 'BT1 6AD', 'BT1 6BS', 'BT1 6BT', 'BT1 6DL', 'BT1 6DU', 'BT1 6DY', 'BT1 6EA', 'BT1 6ED', 'BT1 6EF', 'BT1 6ET', 'BT1 6FD', 'BT1 6GB', 'BT1 6GE', 'BT1 6HT', 'BT1 6JA', 'BT1 6JH', 'BT1 6JS', 'BT1 6NB', 'BT1 6PA', 'BT1 6PF', 'BT1 6PG', 'BT1 6PW', 'BT1 IHF', 'BT10 0AA', 'BT10 0BB', 'BT10 0BD', 'BT10 0BE', 'BT10 0BG', 'BT10 0BN', 'BT10 0BW', 'BT10 0DG', 'BT10 0DJ', 'BT10 0DR', 'BT10 0GE', 'BT10 0GW', 'BT10 0GX', 'BT10 0GY', 'BT10 0HQ', 'BT10 0JA', 'BT10 0JB', 'BT10 0JH', 'BT10 0JJ', 'BT10 0JS', 'BT10 0LA', 'BT10 0LE', 'BT10 0LF', 'BT10 0LG', 'BT10 0LH', 'BT10 0LJ', 'BT10 0LN', 'BT10 0LP', 'BT10 0NB', 'BT10 0NE', 'BT10 0NF', 'BT11 8AY', 'BT11 8BA', 'BT11 8BD', 'BT11 8BG', 'BT11 8BH', 'BT11 8BL', 'BT11 8BN', 'BT11 8BU', 'BT11 8BW', 'BT11 8BX', 'BT11 8DX', 'BT11 8EF', 'BT11 8EJ', 'BT11 8EL', 'BT11 8FE', 'BT11 8GD', 'BT11 8GE', 'BT11 8GX', 'BT11 8HQ', 'BT11 8HR', 'BT11 8HT', 'BT11 8JF', 'BT11 8JP', 'BT11 8LT', 'BT11 8LU', 'BT11 8NE', 'BT11 8NF', 'BT11 8PA', 'BT11 8QB', 'BT11 8QS', 'BT11 9AB', 'BT11 9AE', 'BT11 9AF', 'BT11 9AH', 'BT11 9AJ', 'BT11 9AN', 'BT11 9AP', 'BT11 9BH', 'BT11 9BQ', 'BT11 9BT', 'BT11 9BU', 'BT11 9BW', 'BT11 9BX', 'BT11 9BY', 'BT11 9BZ', 'BT11 9DA', 'BT11 9DS', 'BT11 9DT', 'BT11 9EA', 'BT11 9EB', 'BT11 9EH', 'BT11 9EL', 'BT11 9FG', 'BT11 9FZ', 'BT11 9GE', 'BT11 9GS', 'BT11 9HF', 'BT11 9JD', 'BT11 9JP', 'BT11 9JQ', 'BT11 9JY', 'BT11 9LW', 'BT11 9NS', 'BT11 9PB', 'BT11 9PE', 'BT11 9PG', 'BT11 9PH', 'BT11 9PR', 'BT11 9QF', 'BT11 9QL', 'BT11 9QR', 'BT11 9QS', 'BT12 4AD', 'BT12 4AH', 'BT12 4AQ', 'BT12 4DH', 'BT12 4EJ', 'BT12 4GB', 'BT12 4GL', 'BT12 4GX', 'BT12 4HG', 'BT12 4HL', 'BT12 4HQ', 'BT12 4JR', 'BT12 4JT', 'BT12 4LD', 'BT12 4LL', 'BT12 4LP', 'BT12 4PD', 'BT12 4PE', 'BT12 4PP', 'BT12 4SG', 'BT12 5AB', 'BT12 5AD', 'BT12 5AH', 'BT12 5AT', 'BT12 5BJ', 'BT12 5BL', 'BT12 5EE', 'BT12 5ET', 'BT12 5EW', 'BT12 5EX', 'BT12 5EY', 'BT12 5GH', 'BT12 5HU', 'BT12 5HX', 'BT12 5JJ', 'BT12 5JL', 'BT12 5JN', 'BT12 5JS', 'BT12 5LA', 'BT12 5ND', 'BT12 5NE', 'BT12 5NS', 'BT12 6AA', 'BT12 6AD', 'BT12 6AE', 'BT12 6AF', 'BT12 6AG', 'BT12 6AH', 'BT12 6AL', 'BT12 6AU', 'BT12 6AW', 'BT12 6AX', 'BT12 6BA', 'BT12 6BZ', 'BT12 6DD', 'BT12 6DE', 'BT12 6DG', 'BT12 6DJ', 'BT12 6EA', 'BT12 6EN', 'BT12 6EU', 'BT12 6FB', 'BT12 6FD', 'BT12 6FE', 'BT12 6FF', 'BT12 6FQ', 'BT12 6FR', 'BT12 6FW', 'BT12 6FX', 'BT12 6HE', 'BT12 6HF', 'BT12 6HN', 'BT12 6HP', 'BT12 6HR', 'BT12 6HS', 'BT12 6HT', 'BT12 6HU', 'BT12 6HW', 'BT12 6JA', 'BT12 6LP', 'BT12 6LU', 'BT12 6NB', 'BT12 6NE', 'BT12 6NQ', 'BT12 6QA', 'BT12 6QB', 'BT12 6RH', 'BT12 6SJ', 'BT12 6TA', 'BT12 7AB', 'BT12 7AD', 'BT12 7AE', 'BT12 7AG', 'BT12 7AH', 'BT12 7AJ', 'BT12 7AL', 'BT12 7AQ', 'BT12 7AU', 'BT12 7AW', 'BT12 7AX', 'BT12 7BA', 'BT12 7BQ', 'BT12 7DD', 'BT12 7DG', 'BT12 7DJ', 'BT12 7DN', 'BT12 7DQ', 'BT12 7DR', 'BT12 7DU', 'BT12 7DW', 'BT12 7DX', 'BT12 7EX', 'BT12 7FN', 'BT12 7FP', 'BT12 7FQ', 'BT12 7FW', 'BT12 7FX', 'BT12 7GG', 'BT12 7HS', 'BT12 7JD', 'BT12 7JE', 'BT12 7JG', 'BT12 7JL', 'BT12 7JZ', 'BT12 7LA', 'BT12 7LS', 'BT12 7NA', 'BT12 7NB', 'BT12 7NH', 'BT12 7NN', 'BT12 7NW', 'BT12 7PD', 'BT12 7PF', 'BT12 7PG', 'BT12 7PJ', 'BT12 7PL', 'BT12 7PR', 'BT12 7PS', 'BT12 7PX', 'BT12 7QH', 'BT12 7QL', 'BT12 7QX', 'BT12 7RF', 'BT12 7RS', 'BT13 1AA', 'BT13 1AB', 'BT13 1AD', 'BT13 1AR', 'BT13 1BX', 'BT13 1DF', 'BT13 1DP', 'BT13 1DR', 'BT13 1DT', 'BT13 1ER', 'BT13 1FD', 'BT13 1FP', 'BT13 1FQ', 'BT13 1FR', 'BT13 1FT', 'BT13 1FX', 'BT13 1HW', 'BT13 1JB', 'BT13 1JJ', 'BT13 1LT', 'BT13 1LY', 'BT13 1QD', 'BT13 1QG', 'BT13 1RT', 'BT13 1RU', 'BT13 2AD', 'BT13 2BA', 'BT13 2BB', 'BT13 2BD', 'BT13 2BE', 'BT13 2BH', 'BT13 2BJ', 'BT13 2BN', 'BT13 2DE', 'BT13 2DR', 'BT13 2ES', 'BT13 2HS', 'BT13 2HT', 'BT13 2HZ', 'BT13 2JF', 'BT13 2JR', 'BT13 2QW', 'BT13 2RH', 'BT13 2RL', 'BT13 2SE', 'BT13 3AB', 'BT13 3AD', 'BT13 3AE', 'BT13 3AF', 'BT13 3AG', 'BT13 3BN', 'BT13 3BP', 'BT13 3BS', 'BT13 3BT', 'BT13 3BU', 'BT13 3DH', 'BT13 3DQ', 'BT13 3EP', 'BT13 3FR', 'BT13 3GD', 'BT13 3GG', 'BT13 3GH', 'BT13 3GQ', 'BT13 3GW', 'BT13 3HT', 'BT13 3HX', 'BT13 3JB', 'BT13 3JH', 'BT13 3JJ', 'BT13 3LA', 'BT13 3LB', 'BT13 3LD', 'BT13 3LF', 'BT13 3NE', 'BT13 3NG', 'BT13 3NH', 'BT13 3NW', 'BT13 3PQ', 'BT13 3PU', 'BT13 3PW', 'BT13 3QF', 'BT13 3QQ', 'BT13 3QX', 'BT13 3RP', 'BT13 3RQ', 'BT13 3SE', 'BT13 3SR', 'BT13 3ST', 'BT13 3TA', 'BT13 3TT', 'BT13 3TU', 'BT13 3XZ', 'BT14 6AA', 'BT14 6AD', 'BT14 6BP', 'BT14 6BS', 'BT14 6BY', 'BT14 6DW', 'BT14 6EN', 'BT14 6FW', 'BT14 6GN', 'BT14 6HH', 'BT14 6HP', 'BT14 6JH', 'BT14 6JL', 'BT14 6JR', 'BT14 6JT', 'BT14 6JU', 'BT14 6JX', 'BT14 6JZ', 'BT14 6LA', 'BT14 6LB', 'BT14 6LE', 'BT14 6LN', 'BT14 6LZ', 'BT14 6NH', 'BT14 6NN', 'BT14 6NP', 'BT14 6NQ', 'BT14 6PN', 'BT14 6PX', 'BT14 6QH', 'BT14 6QJ', 'BT14 6QP', 'BT14 6QQ', 'BT14 6QR', 'BT14 6QU', 'BT14 6QX', 'BT14 6QY', 'BT14 6QZ', 'BT14 6RB', 'BT14 6RE', 'BT14 6ST', 'BT14 7AA', 'BT14 7BW', 'BT14 7DA', 'BT14 7DY', 'BT14 7EA', 'BT14 7ED', 'BT14 7EE', 'BT14 7EJ', 'BT14 7EX', 'BT14 7FG', 'BT14 7GA', 'BT14 7GB', 'BT14 7GE', 'BT14 7GL', 'BT14 7GP', 'BT14 7HD', 'BT14 7HU', 'BT14 7HX', 'BT14 7HZ', 'BT14 7JA', 'BT14 7JB', 'BT14 7LD', 'BT14 7NA', 'BT14 7ND', 'BT14 7NT', 'BT14 7NZ', 'BT14 7PT', 'BT14 7QH', 'BT14 7QR', 'BT14 7QS', 'BT14 8AB', 'BT14 8AE', 'BT14 8AL', 'BT14 8AN', 'BT14 8AW', 'BT14 8BH', 'BT14 8BW', 'BT14 8BX', 'BT14 8DP', 'BT14 8DQ', 'BT14 8DT', 'BT14 8FG', 'BT14 8FH', 'BT14 8HD', 'BT14 8JN', 'BT14 8LT', 'BT14 8QA', 'BT14 8QU', 'BT15 1AB', 'BT15 1AL', 'BT15 1AS', 'BT15 1DY', 'BT15 1ED', 'BT15 1EQ', 'BT15 1ES', 'BT15 1EZ', 'BT15 1FT', 'BT15 1GQ', 'BT15 1HQ', 'BT15 1HR', 'BT15 1JP', 'BT15 1JQ', 'BT15 1WA', 'BT15 2AA', 'BT15 2AE', 'BT15 2AJ', 'BT15 2AN', 'BT15 2AR', 'BT15 2AY', 'BT15 2BB', 'BT15 2BE', 'BT15 2BL', 'BT15 2BN', 'BT15 2BP', 'BT15 2BW', 'BT15 2BX', 'BT15 2EL', 'BT15 2EN', 'BT15 2ET', 'BT15 2GB', 'BT15 2GJ', 'BT15 2GL', 'BT15 2GN', 'BT15 2GQ', 'BT15 2GW', 'BT15 2GY', 'BT15 2GZ', 'BT15 2HF', 'BT15 2HN', 'BT15 2HP', 'BT15 2HR', 'BT15 2PR', 'BT15 3AA', 'BT15 3AB', 'BT15 3BG', 'BT15 3BH', 'BT15 3BJ', 'BT15 3BL', 'BT15 3BP', 'BT15 3BS', 'BT15 3BU', 'BT15 3DQ', 'BT15 3DT', 'BT15 3EG', 'BT15 3GA', 'BT15 3GJ', 'BT15 3GQ', 'BT15 3HE', 'BT15 3HF', 'BT15 3JQ', 'BT15 3JW', 'BT15 3LG', 'BT15 3LH', 'BT15 3LJ', 'BT15 3NB', 'BT15 3NG', 'BT15 3PG', 'BT15 3PL', 'BT15 3PN', 'BT15 3PR', 'BT15 3PW', 'BT15 3PZ', 'BT15 3QA', 'BT15 3QG', 'BT15 3QX', 'BT15 4AE', 'BT15 4AN', 'BT15 4AP', 'BT15 4AQ', 'BT15 4AS', 'BT15 4AY', 'BT15 4BL', 'BT15 4DB', 'BT15 4DD', 'BT15 4DE', 'BT15 4DG', 'BT15 4DX', 'BT15 4DZ', 'BT15 4EF', 'BT15 4EN', 'BT15 4EP', 'BT15 4EZ', 'BT15 4FQ', 'BT15 4HD', 'BT15 4HG', 'BT15 4HL', 'BT15 4HN', 'BT15 4HS', 'BT15 4JA', 'BT15 4JU', 'BT15 4NJ', 'BT15 5AA', 'BT15 5AB', 'BT15 5AT', 'BT15 5BE', 'BT15 5BJ', 'BT15 5BP', 'BT15 5BS', 'BT15 5BU', 'BT15 5DB', 'BT15 5DW', 'BT15 5EB', 'BT15 5EP', 'BT15 5EX', 'BT15 5EY', 'BT15 5FG', 'BT15 5GA', 'BT15 5GB', 'BT15 5GE', 'BT15 5GF', 'BT15 5GP', 'BT15 5GQ', 'BT15 5GR', 'BT15 5HD', 'BT15 5JF', 'BT16 1DG', 'BT17 0AH', 'BT17 0AR', 'BT17 0AW', 'BT17 0BU', 'BT17 0DB', 'BT17 0DF', 'BT17 0DW', 'BT17 0DY', 'BT17 0FA', 'BT17 0FB', 'BT17 0GL', 'BT17 0GU', 'BT17 0GY', 'BT17 0HD', 'BT17 0HE', 'BT17 0HH', 'BT17 0HW', 'BT17 0LB', 'BT17 0LG', 'BT17 0LR', 'BT17 0LT', 'BT17 0LW', 'BT17 0NG', 'BT17 0NP', 'BT17 0PB', 'BT17 0PH', 'BT17 0PJ', 'BT17 0PL', 'BT17 0PP', 'BT17 0PQ', 'BT17 0RL', 'BT17 0RP', 'BT17 0RQ', 'BT17 0RW', 'BT17 0RX', 'BT17 0SJ', 'BT17 0TD', 'BT17 0TG', 'BT17 0TH', 'BT17 0TQ', 'BT17 0UG', 'BT17 0UN', 'BT17 0UT', 'BT17 0UW', 'BT17 0XF', 'BT17 0YU', 'BT17 9AD', 'BT17 9AE', 'BT17 9AF', 'BT17 9AN', 'BT17 9BA', 'BT17 9EH', 'BT17 9EJ', 'BT17 9EL', 'BT17 9JP', 'BT17 9JS', 'BT17 9JZ', 'BT17 9LA', 'BT17 9LB', 'BT17 9NP', 'BT17 9NS', 'BT17 9QA', 'BT17 9QH', 'BT17 9QX', 'BT17 9RR', 'BT17 9RY', 'BT17 9SB', 'BT17 ODB', 'BT17 OPB', 'BT17 ORE', 'BT17 ORL', 'BT18 9QE', 'BT18 9RR', 'BT2 7AF', 'BT2 7AP', 'BT2 7BA', 'BT2 7BB', 'BT2 7BE', 'BT2 7DB', 'BT2 7DT', 'BT2 7EJ', 'BT2 7ES', 'BT2 7ET', 'BT2 7FB', 'BT2 7FD', 'BT2 7FF', 'BT2 7GA', 'BT2 7GE', 'BT2 7GN', 'BT2 7GP', 'BT2 7GS', 'BT2 7HB', 'BT2 7HD', 'BT2 7HE', 'BT2 7HF', 'BT2 7HN', 'BT2 7HP', 'BT2 7HR', 'BT2 7JD', 'BT2 7JH', 'BT2 7JS', 'BT2 8AA', 'BT2 8AB', 'BT2 8BG', 'BT2 8BP', 'BT2 8BS', 'BT2 8ED', 'BT2 8FD', 'BT2 8FE', 'BT2 8GB', 'BT2 8GD', 'BT2 8HD', 'BT2 8HQ', 'BT2 8HS', 'BT2 8JE', 'BT2 8LA', 'BT2 8LG', 'BT2 8PB', 'BT2 8PS', 'BT27 4EW', 'BT3', 'BT3 9 JL', 'BT3 9AA', 'BT3 9AE', 'BT3 9AH', 'BT3 9AL', 'BT3 9AP', 'BT3 9AR', 'BT3 9AS', 'BT3 9BJ', 'BT3 9BP', 'BT3 9BU', 'BT3 9BW', 'BT3 9BX', 'BT3 9DH', 'BT3 9DP', 'BT3 9DT', 'BT3 9DY', 'BT3 9DZ', 'BT3 9ED', 'BT3 9EJ', 'BT3 9EP', 'BT3 9EZ', 'BT3 9HA', 'BT3 9HJ', 'BT3 9HQ', 'BT3 9HY', 'BT3 9JA', 'BT3 9JG', 'BT3 9JH', 'BT3 9JL', 'BT3 9JU', 'BT3 9LB', 'BT3 9LE', 'BT3 9LG', 'BT3 9LL', 'BT3 9LS', 'BT3 9LZ', 'BT3 9QQ', 'BT36 5DB', 'BT36 5ED', 'BT36 7DG', 'BT36 7DH', 'BT36 7DQ', 'BT36 7EN', 'BT36 7ES', 'BT36 7EU', 'BT36 7NH', 'BT36 7NU', 'BT36 7PG', 'BT36 7RW', 'BT4 1AB', 'BT4 1AD', 'BT4 1AF', 'BT4 1AG', 'BT4 1AJ', 'BT4 1AL', 'BT4 1AQ', 'BT4 1BP', 'BT4 1BT', 'BT4 1EH', 'BT4 1EN', 'BT4 1EQ', 'BT4 1EW', 'BT4 1EX', 'BT4 1FB', 'BT4 1FT', 'BT4 1HB', 'BT4 1HD', 'BT4 1HE', 'BT4 1HG', 'BT4 1HH', 'BT4 1HN', 'BT4 1JT', 'BT4 1JW', 'BT4 1JX', 'BT4 1JY', 'BT4 1NG', 'BT4 1NH', 'BT4 1NL', 'BT4 1NS', 'BT4 1NT', 'BT4 1NU', 'BT4 1NY', 'BT4 1PB', 'BT4 1PD', 'BT4 1PP', 'BT4 1PQ', 'BT4 1PU', 'BT4 1QA', 'BT4 1QQ', 'BT4 1RF', 'BT4 1RN', 'BT4 1SG', 'BT4 1SJ', 'BT4 1SU', 'BT4 2AA', 'BT4 2AB', 'BT4 2AG', 'BT4 2AH', 'BT4 2AN', 'BT4 2AS', 'BT4 2AU', 'BT4 2AW', 'BT4 2DA', 'BT4 2DT', 'BT4 2EG', 'BT4 2EX', 'BT4 2EY', 'BT4 2FE', 'BT4 2GD', 'BT4 2GP', 'BT4 2GU', 'BT4 2JP', 'BT4 2JT', 'BT4 2LR', 'BT4 2LS', 'BT4 2LY', 'BT4 2NA', 'BT4 2ND', 'BT4 2NF', 'BT4 2NL', 'BT4 2NR', 'BT4 2NW', 'BT4 2PD', 'BT4 2PW', 'BT4 2QF', 'BT4 2QQ', 'BT4 3BA', 'BT4 3BE', 'BT4 3BL', 'BT4 3BQ', 'BT4 3DA', 'BT4 3DJ', 'BT4 3DW', 'BT4 3EA', 'BT4 3EL', 'BT4 3EN', 'BT4 3EQ', 'BT4 3ET', 'BT4 3EU', 'BT4 3EX', 'BT4 3EY', 'BT4 3EZ', 'BT4 3FF', 'BT4 3FG', 'BT4 3GE', 'BT4 3HH', 'BT4 3HL', 'BT4 3HN', 'BT4 3HS', 'BT4 3HT', 'BT4 3JB', 'BT4 3JD', 'BT4 3JF', 'BT4 3JH', 'BT4 3JJ', 'BT4 3JU', 'BT4 3LD', 'BT4 3LH', 'BT4 3LJ', 'BT4 3LL', 'BT4 3LP', 'BT4 3LS', 'BT4 3NL', 'BT4 3NU', 'BT4 3SR', 'BT4 3SS', 'BT4 3ST', 'BT4 3SX', 'BT4 3SY', 'BT4 3TA', 'BT4 3TT', 'BT5', 'BT5 4AE', 'BT5 4AF', 'BT5 4AS', 'BT5 4EQ', 'BT5 4EW', 'BT5 4FE', 'BT5 4FG', 'BT5 4FT', 'BT5 4GJ', 'BT5 4GS', 'BT5 4GU', 'BT5 4GW', 'BT5 4GX', 'BT5 4HY', 'BT5 4JE', 'BT5 4LA', 'BT5 4LP', 'BT5 4NA', 'BT5 4ND', 'BT5 4NE', 'BT5 4NF', 'BT5 4NJ', 'BT5 4NL', 'BT5 4NS', 'BT5 4PS', 'BT5 4PY', 'BT5 4QA', 'BT5 4QG', 'BT5 4RR', 'BT5 4RS', 'BT5 4RZ', 'BT5 4SF', 'BT5 4SJ', 'BT5 5AA', 'BT5 5AB', 'BT5 5AD', 'BT5 5AW', 'BT5 5BA', 'BT5 5DL', 'BT5 5DS', 'BT5 5DT', 'BT5 5DW', 'BT5 5DX', 'BT5 5DY', 'BT5 5EG', 'BT5 5FB', 'BT5 5FE', 'BT5 5FH', 'BT5 5FL', 'BT5 5FP', 'BT5 5FR', 'BT5 5FS', 'BT5 5FT', 'BT5 5GJ', 'BT5 5GT', 'BT5 5HH', 'BT5 5HW', 'BT5 5JB', 'BT5 5JH', 'BT5 5JQ', 'BT5 5LP', 'BT5 5LS', 'BT5 5LT', 'BT5 5NF', 'BT5 5NG', 'BT5 5NH', 'BT5 5NX', 'BT5 5PB', 'BT5 5PE', 'BT5 6AB', 'BT5 6BB', 'BT5 6BH', 'BT5 6BW', 'BT5 6DA', 'BT5 6DF', 'BT5 6DS', 'BT5 6EB', 'BT5 6ED', 'BT5 6EH', 'BT5 6FR', 'BT5 6FY', 'BT5 6GA', 'BT5 6GR', 'BT5 6HJ', 'BT5 6HW', 'BT5 6JF', 'BT5 6JH', 'BT5 6JJ', 'BT5 6LA', 'BT5 6NF', 'BT5 6NU', 'BT5 6NY', 'BT5 6PL', 'BT5 6QD', 'BT5 6QE', 'BT5 6QP', 'BT5 6QR', 'BT5 6SL', 'BT5 6SR', 'BT5 7AH', 'BT5 7BS', 'BT5 7BX', 'BT5 7DG', 'BT5 7DL', 'BT5 7EA', 'BT5 7EH', 'BT5 7EJ', 'BT5 7EL', 'BT5 7EP', 'BT5 7FE', 'BT5 7FF', 'BT5 7GE', 'BT5 7GF', 'BT5 7GU', 'BT5 7GY', 'BT5 7HD', 'BT5 7JH', 'BT5 7JU', 'BT5 7JX', 'BT5 7LA', 'BT5 7LH', 'BT5 7LL', 'BT5 7LT', 'BT5 7LY', 'BT5 7NU', 'BT5 7PT', 'BT5 7PX', 'BT5 7QL', 'BT5 7QQ', 'BT5 7QZ', 'BT6 0AG', 'BT6 0BU', 'BT6 0BW', 'BT6 0BY', 'BT6 0BZ', 'BT6 0DE', 'BT6 0DG', 'BT6 0DL', 'BT6 0DN', 'BT6 0DT', 'BT6 0EX', 'BT6 0FL', 'BT6 0FR', 'BT6 0FS', 'BT6 0GB', 'BT6 0HB', 'BT6 0JA', 'BT6 0JB', 'BT6 0JJ', 'BT6 0JN', 'BT6 0JS', 'BT6 0LB', 'BT6 0LR', 'BT6 0LT', 'BT6 0LX', 'BT6 8AB', 'BT6 8AW', 'BT6 8BG', 'BT6 8DD', 'BT6 8DP', 'BT6 8DQ', 'BT6 8DR', 'BT6 8EE', 'BT6 8FB', 'BT6 8FE', 'BT6 8GH', 'BT6 8GJ', 'BT6 8GL', 'BT6 8HN', 'BT6 8HR', 'BT6 8JG', 'BT6 8LF', 'BT6 8ND', 'BT6 8PQ', 'BT6 8PR', 'BT6 8PT', 'BT6 8PU', 'BT6 8PW', 'BT6 8PX', 'BT6 8PY', 'BT6 8PZ', 'BT6 8RA', 'BT6 9AG', 'BT6 9DD', 'BT6 9DL', 'BT6 9DP', 'BT6 9DQ', 'BT6 9DR', 'BT6 9EP', 'BT6 9EQ', 'BT6 9ER', 'BT6 9ES', 'BT6 9ET', 'BT6 9EY', 'BT6 9FB', 'BT6 9FH', 'BT6 9GB', 'BT6 9GL', 'BT6 9GR', 'BT6 9HB', 'BT6 9HG', 'BT6 9HL', 'BT6 9HQ', 'BT6 9JA', 'BT6 9JD', 'BT6 9JY', 'BT6 9LS', 'BT6 9NS', 'BT6 9NT', 'BT6 9NX', 'BT6 9PR', 'BT6 9QB', 'BT6 9QL', 'BT6 9RA', 'BT6 9RB', 'BT6 9RZ', 'BT6 9SA', 'BT6 OFL', 'BT6 OLR', 'BT6 OLT', 'BT7 1AB', 'BT7 1AQ', 'BT7 1BS', 'BT7 1BU', 'BT7 1BX', 'BT7 1DQ', 'BT7 1DR', 'BT7 1DS', 'BT7 1DT', 'BT7 1EB', 'BT7 1FY', 'BT7 1GH', 'BT7 1GQ', 'BT7 1GX', 'BT7 1GY', 'BT7 1HL', 'BT7 1HP', 'BT7 1HU', 'BT7 1JG', 'BT7 1JJ', 'BT7 1JL', 'BT7 1JN', 'BT7 1JQ', 'BT7 1JR', 'BT7 1JT', 'BT7 1JW', 'BT7 1LP', 'BT7 1LW', 'BT7 1NA', 'BT7 1ND', 'BT7 1NF', 'BT7 1NG', 'BT7 1NH', 'BT7 1NJ', 'BT7 1NN', 'BT7 1NR', 'BT7 1PS', 'BT7 1QA', 'BT7 1QB', 'BT7 1QY', 'BT7 1RQ', 'BT7 1RR', 'BT7 1RS', 'BT7 1RT', 'BT7 1RU', 'BT7 1SH', 'BT7 1SL', 'BT7 1SQ', 'BT7 2AA', 'BT7 2AN', 'BT7 2BJ', 'BT7 2EB', 'BT7 2ED', 'BT7 2EP', 'BT7 2FW', 'BT7 2FX', 'BT7 2FZ', 'BT7 2GB', 'BT7 2GD', 'BT7 2GE', 'BT7 2GF', 'BT7 2GN', 'BT7 2GY', 'BT7 2HP', 'BT7 2JA', 'BT7 2JB', 'BT7 2JD', 'BT7 3AE', 'BT7 3BS', 'BT7 3DB', 'BT7 3ED', 'BT7 3EG', 'BT7 3EN', 'BT7 3ES', 'BT7 3FF', 'BT7 3FP', 'BT7 3GG', 'BT7 3GL', 'BT7 3GP', 'BT7 3GQ', 'BT7 3GR', 'BT7 3GX', 'BT7 3HB', 'BT7 3HE', 'BT7 3HG', 'BT7 3HL', 'BT7 3HX', 'BT7 3HY', 'BT7 3JB', 'BT7 3JD', 'BT7 3JH', 'BT7 3JJ', 'BT7 3LA', 'BT7 3LW', 'BT8 4DT', 'BT8 4SW', 'BT8 7AN', 'BT8 7DL', 'BT8 7EG', 'BT8 7EY', 'BT8 7PX', 'BT8 7QE', 'BT8 7SP', 'BT8 7SW', 'BT8 7XP', 'BT8 7XT', 'BT9 5AA', 'BT9 5AB', 'BT9 5AD', 'BT9 5AE', 'BT9 5AF', 'BT9 5AG', 'BT9 5AH', 'BT9 5AJ', 'BT9 5BJ', 'BT9 5BN', 'BT9 5BQ', 'BT9 5BU', 'BT9 5BW', 'BT9 5BY', 'BT9 5DY', 'BT9 5EA', 'BT9 5EH', 'BT9 5EL', 'BT9 5EX', 'BT9 5FB', 'BT9 5FH', 'BT9 5FJ', 'BT9 5FL', 'BT9 5JH', 'BT9 5JQ', 'BT9 5JX', 'BT9 5JY', 'BT9 5LH', 'BT9 5NA', 'BT9 5NW', 'BT9 5PB', 'BT9 5PH', 'BT9 5PX', 'BT9 5UB', 'BT9 6AA', 'BT9 6AD', 'BT9 6AF', 'BT9 6AG', 'BT9 6AH', 'BT9 6AJ', 'BT9 6AL', 'BT9 6AX', 'BT9 6AY', 'BT9 6BQ', 'BT9 6BY', 'BT9 6DP', 'BT9 6DW', 'BT9 6DX', 'BT9 6DY', 'BT9 6EE', 'BT9 6EJ', 'BT9 6EL', 'BT9 6EU', 'BT9 6EW', 'BT9 6FL', 'BT9 6FP', 'BT9 6FR', 'BT9 6FX', 'BT9 6GD', 'BT9 6GH', 'BT9 6GJ', 'BT9 6GL', 'BT9 6GN', 'BT9 6GU', 'BT9 6GX', 'BT9 6HG', 'BT9 6HL', 'BT9 6HT', 'BT9 6HW', 'BT9 6JP', 'BT9 6JR', 'BT9 6LD', 'BT9 6LN', 'BT9 6LY', 'BT9 6NH', 'BT9 6QR', 'BT9 6RT', 'BT9 6RU', 'BT9 6RW', 'BT9 6SB', 'BT9 6SJ', 'BT9 6TQ', 'BT9 6TT', 'BT9 6TX', 'BT9 6TY', 'BT9 7AB', 'BT9 7AE', 'BT9 7AG', 'BT9 7AJ', 'BT9 7AQ', 'BT9 7AT', 'BT9 7BJ', 'BT9 7BL', 'BT9 7BW', 'BT9 7DS', 'BT9 7DW', 'BT9 7EJ', 'BT9 7EN', 'BT9 7EP', 'BT9 7ET', 'BT9 7EW', 'BT9 7EY', 'BT9 7EZ', 'BT9 7FR', 'BT9 7GQ', 'BT9 7GS', 'BT9 7GT', 'BT9 7GU', 'BT9 7GX', 'BT9 7GY', 'BT9 7HN', 'BT9 7JA', 'BT9 7JB', 'BT9 7JE', 'BT9 7JG']
*Bonus Round*, top postcodes, alphabetically
df.groupby('postcode').size().sort_values(ascending=False).head(6)
postcode BT1 3NQ 86 33 BT1 4QG 23 BT15 1WA 19 BT1 1HL 18 BT1 1DD 18 dtype: int64
Bonus Bonus Round, there are many ways to do similar things Top postcodes, by order of 1st appearance in dataset
df['postcode'].value_counts(ascending=False)[:6]
BT1 3NQ 86 33 BT1 4QG 23 BT15 1WA 19 BT12 6HU 18 BT3 9JL 18 Name: postcode, dtype: int64
df[df['postcode'] == '']['establishmentname']
101 El Divino 294 Gibsons Butchers 533 Van Shop 540 St Galls Coffee Shop 607 Glenbrook Surestart 672 Glenbrook Surestart 767 Costa Coffee 774 Holohans 776 Cafe Krem Students Union 972 Royal Day Care 1025 Glenbrook Surestart 1030 Malone Kindergarden - Simply Me 1500 Top Centra 1531 Russell's 1763 The Dock Cafe 2154 Select Catering 2158 The Bus Stop Cafe 2161 Firths Traditional Fish & Chips 2162 Attridge & Cole 2165 Bernies Burgers 2170 Top Scoff 2263 Campbell McCleave Ltd 2290 First Presbyterian Church 2540 Pizza Hut (Victoria Square) 2541 TGI Fridays (Victoria Square) 2651 Pizza Express 2699 GIBONEY HOUSE (CLANMIL HOUSING) 2815 Select Sandwiches 2928 Grosvenor Grammar School 2943 St John Paul II 2961 Taughmonagh Nursery School 3045 Malton Fold 3103 The Candy Man UK Name: establishmentname, dtype: object
Assumptions:
df[df.isin((None,'')).any(axis=1)]['establishmentname']
0 Heyn Group 1 Rosemary Lunch Club 2 John Ross & Co Auctioneers 3 The Maverick/Boom Box 4 Maverick 5 Windsor Recreation & Social Cl 6 The Chester Inn 7 The Sportsman 8 City Hibernians Club 9 Forthriver Bowling & Tennis Club 10 Shawsbridge Sports Association 11 Muriels Cafe Bar 12 MCGLONES 13 The Corner House 14 Ewarts Bowling Club 15 Ligoniel Working Mens Club 16 DUNMURRY INN 17 McEnaney's 18 LAUREL GLEN 19 Linfield Football Club - Temporary Clubhouse 20 DUNMURRY RECREATION & FOOTBALL CLUB 21 McKennas 22 Ardoyne Working Mens Club 23 The Stadium Bar 24 Ollies 25 The Harp Bar 26 Shorts Sports & Recreation Club 27 The Spaniard 28 The Bar with No Name 29 Cliftonville Golf Club Catering ... 3138 Lyric Theatre 3139 Strand Arts Centre 3140 Cregagh Video 3141 China Cash and Carry 3142 Camseng International Foods Ltd 3143 Shah Jalal 3144 SANGERS (NI) LIMITED 3145 Makro Self Service Wholesalers Ltd 3146 AAH PHARMACEUTICALS LIMITED (VESTRIC) 3147 Sweet Sales 3148 North Down Group 3149 Air Products PLC 3150 Courtney & Nelson Ltd 3151 Spicers Ltd 3152 Topmark Food Cash and Carry 3153 Musgrave Market Place 3154 Asia Supermarket 3155 Lee Foods 3156 Asia Supermarket 3157 Blacks Catering Supplies 3158 Punjana Ltd 3159 Botl Wine & Spirit Merchants Ltd 3160 Cargo Forwarding Ltd 3161 Selecta 3162 Business Services Organisation - Procurement +... 3163 Beck and Scott Services Ltd 3164 Samskip 3165 Irish Feeds 3166 NATURAL HEALTH PRODUCTS 3167 STOREHOUSE Name: establishmentname, Length: 3168, dtype: object
Well that was a bit pointless; everything is missing something! Our assumption was wrong
New Assumption:
df[df[['postcode','rating','establishmentaddressline3']].isin((None,'')).any(axis=1)]['establishmentname']
10 Shawsbridge Sports Association 26 Shorts Sports & Recreation Club 35 The Farmers 101 El Divino 186 Berlin 280 Butchers Mini Market 294 Gibsons Butchers 454 Junes 533 Van Shop 540 St Galls Coffee Shop 547 Canteen Kitchen Cafe 553 Mornington Community Project 572 White Field Coffee 587 Franklin & James Expresso Bar 590 Loft 596 Boyles at the Linenhall Library 600 Boojum 607 Glenbrook Surestart 616 INTO Queen's University Belfast 640 Cafe Meluzyna 643 Cuban Sandwich Factory 645 Polita Cafe Bistro 672 Glenbrook Surestart 674 Flamenco Patisserie 678 California Coffee 699 Rcity Youth CIC 748 Cafe Royale 767 Costa Coffee 774 Holohans 776 Cafe Krem Students Union ... 2446 Wilgar Drop In Centre 2461 Jasmine 2462 Hotplate Noshery 2465 Bulletproof Burgers 2471 Harry Halls Bistro 2485 General Merchants 2512 Boojum 2519 India Gate 2533 All Seasons 2537 The Belfast Baking Company Limited 2540 Pizza Hut (Victoria Square) 2541 TGI Fridays (Victoria Square) 2542 Five Guys 2651 Pizza Express 2699 GIBONEY HOUSE (CLANMIL HOUSING) 2749 Polita 2815 Select Sandwiches 2817 Clements ( Manufacturer ) 2928 Grosvenor Grammar School 2943 St John Paul II 2961 Taughmonagh Nursery School 2970 St Marys PS Barrack Street.Divis 3045 Malton Fold 3080 Elles Belles 3083 The Coffee Hut 3095 Amberline (Stall at Folktown Market) 3097 Street Dogs (Stall at Folktown Market) 3103 The Candy Man UK 3104 Do me a flavour 3159 Botl Wine & Spirit Merchants Ltd Name: establishmentname, Length: 118, dtype: object
That's all very well and good, but for some of the later challenges, we need to fiddle with things
Data Science is all about tidying stuff up.
df.head()
_id | buildingid | establishmentaddressline1 | establishmentaddressline2 | establishmentaddressline3 | establishmentaddressline4 | establishmentname | inspectiondate | latitude | longitude | postcode | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 185003495 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23T00:00:00 | 375525 | 334581 | BT3 9AH | 5 | ||
1 | 2 | 185746347 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21T00:00:00 | 378213 | 332938 | BT15 5HD | 5 | |
2 | 3 | 185001807 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19T00:00:00 | 373992 | 334055 | BT1 4NX | 4 | ||
3 | 4 | 185000821 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03T00:00:00 | 374852 | 333733 | BT1 2GX | 5 | ||
4 | 5 | 187129705 | 1 Union Street | Maverick | 2014-02-13T00:00:00 | 378435 | 281262 | BT1 2JF | 4 |
# Looking Good but that _id is a bit pointless
df = df.drop('_id', axis=1)
df.head()
buildingid | establishmentaddressline1 | establishmentaddressline2 | establishmentaddressline3 | establishmentaddressline4 | establishmentname | inspectiondate | latitude | longitude | postcode | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 185003495 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23T00:00:00 | 375525 | 334581 | BT3 9AH | 5 | ||
1 | 185746347 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21T00:00:00 | 378213 | 332938 | BT15 5HD | 5 | |
2 | 185001807 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19T00:00:00 | 373992 | 334055 | BT1 4NX | 4 | ||
3 | 185000821 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03T00:00:00 | 374852 | 333733 | BT1 2GX | 5 | ||
4 | 187129705 | 1 Union Street | Maverick | 2014-02-13T00:00:00 | 378435 | 281262 | BT1 2JF | 4 |
try:
df['rating'].mean()
except TypeError as e:
print("That didn't work", e)
That didn't work must be str, not int
# DANGER WILL ROBINSON
df.dtypes
buildingid object establishmentaddressline1 object establishmentaddressline2 object establishmentaddressline3 object establishmentaddressline4 object establishmentname object inspectiondate object latitude object longitude object postcode object rating object dtype: object
df['rating'].unique()
array(['5', '4', None, '2', '3', '1'], dtype=object)
There are A LOT of un-rated premises... So we shouldn't just throw them away.
df[df['rating'].isnull()].size
902
(None is the absence of a value, Nan is "Not a Number") If you really care, look up category theory But I don't, so let's jog on.
df['rating'] = df['rating'].astype(float)
df['rating'].unique()
array([ 5., 4., nan, 2., 3., 1.])
Now, what's the average rating?
df['rating'].mean()
4.385288399222294
This by default throws away nan
values, because performing maths on 'nothing' leads to infinities, which is A Bad Thing™
df['rating'].mean(skipna=False)
nan
Who knows about Histograms?
f,ax = plt.subplots()
df['rating'].plot.hist(ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x115556e10>
size
of each rating group:¶df.groupby('rating').size()
rating 1.0 28 2.0 61 3.0 391 4.0 820 5.0 1786 dtype: int64
f,ax = plt.subplots()
df.groupby('rating').size().plot.pie(ax=ax, label='Ratings')
<matplotlib.axes._subplots.AxesSubplot at 0x11567eef0>
df.head()
buildingid | establishmentaddressline1 | establishmentaddressline2 | establishmentaddressline3 | establishmentaddressline4 | establishmentname | inspectiondate | latitude | longitude | postcode | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 185003495 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23T00:00:00 | 375525 | 334581 | BT3 9AH | 5.0 | ||
1 | 185746347 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21T00:00:00 | 378213 | 332938 | BT15 5HD | 5.0 | |
2 | 185001807 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19T00:00:00 | 373992 | 334055 | BT1 4NX | 4.0 | ||
3 | 185000821 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03T00:00:00 | 374852 | 333733 | BT1 2GX | 5.0 | ||
4 | 187129705 | 1 Union Street | Maverick | 2014-02-13T00:00:00 | 378435 | 281262 | BT1 2JF | 4.0 |
# BORING
df.drop('buildingid', axis=1, inplace=True)
# REDUNDANT
for c in df.keys():
print(c, c.replace('establishment',''))
establishmentaddressline1 addressline1 establishmentaddressline2 addressline2 establishmentaddressline3 addressline3 establishmentaddressline4 addressline4 establishmentname name inspectiondate inspectiondate latitude latitude longitude longitude postcode postcode rating rating
replacements = {c: c.replace('establishment','') for c in df.keys()}
replacements
{'establishmentaddressline1': 'addressline1', 'establishmentaddressline2': 'addressline2', 'establishmentaddressline3': 'addressline3', 'establishmentaddressline4': 'addressline4', 'establishmentname': 'name', 'inspectiondate': 'inspectiondate', 'latitude': 'latitude', 'longitude': 'longitude', 'postcode': 'postcode', 'rating': 'rating'}
df.rename(columns=replacements, inplace=True)
df.head()
addressline1 | addressline2 | addressline3 | addressline4 | name | inspectiondate | latitude | longitude | postcode | rating | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23T00:00:00 | 375525 | 334581 | BT3 9AH | 5.0 | ||
1 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21T00:00:00 | 378213 | 332938 | BT15 5HD | 5.0 | |
2 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19T00:00:00 | 373992 | 334055 | BT1 4NX | 4.0 | ||
3 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03T00:00:00 | 374852 | 333733 | BT1 2GX | 5.0 | ||
4 | 1 Union Street | Maverick | 2014-02-13T00:00:00 | 378435 | 281262 | BT1 2JF | 4.0 |
# Something doesn't look right...
df.dtypes
addressline1 object addressline2 object addressline3 object addressline4 object name object inspectiondate object latitude object longitude object postcode object rating float64 dtype: object
df['inspectiondate'] = pd.to_datetime(df['inspectiondate'])
df.head()
addressline1 | addressline2 | addressline3 | addressline4 | name | inspectiondate | latitude | longitude | postcode | rating | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23 | 375525 | 334581 | BT3 9AH | 5.0 | ||
1 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21 | 378213 | 332938 | BT15 5HD | 5.0 | |
2 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19 | 373992 | 334055 | BT1 4NX | 4.0 | ||
3 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03 | 374852 | 333733 | BT1 2GX | 5.0 | ||
4 | 1 Union Street | Maverick | 2014-02-13 | 378435 | 281262 | BT1 2JF | 4.0 |
df['dayssinceinspection']=pd.to_datetime('now').date()- df['inspectiondate']
df[['name','inspectiondate','dayssinceinspection']].head()
name | inspectiondate | dayssinceinspection | |
---|---|---|---|
0 | Heyn Group | 2012-08-23 | 2114 days |
1 | Rosemary Lunch Club | 2014-10-21 | 1325 days |
2 | John Ross & Co Auctioneers | 2015-03-19 | 1176 days |
3 | The Maverick/Boom Box | 2014-11-03 | 1312 days |
4 | Maverick | 2014-02-13 | 1575 days |
df.dayssinceinspection.describe()
count 3097 mean 1336 days 09:09:07.497578 std 303 days 12:53:26.331291 min 877 days 00:00:00 25% 1105 days 00:00:00 50% 1290 days 00:00:00 75% 1487 days 00:00:00 max 3957 days 00:00:00 Name: dayssinceinspection, dtype: object
df.sort_values(by='dayssinceinspection', ascending=False).head()
addressline1 | addressline2 | addressline3 | addressline4 | name | inspectiondate | latitude | longitude | postcode | rating | dayssinceinspection | |
---|---|---|---|---|---|---|---|---|---|---|---|
239 | 214 Antrim Road | Belfast | MCCRACKENS B&B | 2007-08-07 | 376040 | 333301 | BT15 2AN | 4.0 | 3957 days | ||
2100 | 5 Corry Place | Belfast | Port Health | 2008-09-01 | 375405 | 334543 | BT3 9HY | 5.0 | 3566 days | ||
2190 | Dunmurry | 144 Laurelbank | NICHOLAS RULES MOBILE VAN | 2009-09-09 | 369869 | 327711 | BT17 0RX | 5.0 | 3193 days | ||
3027 | Clifton House | 2 North Queen Street | Belfast | Helm Housing | 2011-07-20 | 375111 | 333607 | BT15 1EQ | 5.0 | 2514 days | |
1975 | Castlereagh | 19 Castlehill Farm | YUMMIE TREATS | 2011-08-17 | 371496 | 338183 | BT5 7GU | 5.0 | 2486 days |
df.sort_values(by='dayssinceinspection', ascending=False).iloc[0]
addressline1 addressline2 addressline3 214 Antrim Road addressline4 Belfast name MCCRACKENS B&B inspectiondate 2007-08-07 00:00:00 latitude 376040 longitude 333301 postcode BT15 2AN rating 4 dayssinceinspection 3957 days 00:00:00 Name: 239, dtype: object
f,ax = plt.subplots(figsize=graph_figsize)
df.dayssinceinspection.dt.days.plot.hist(ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x115887ba8>
f,ax = plt.subplots(figsize=graph_figsize)
df.dayssinceinspection.dt.days.plot.hist(ax=ax)
first_inspection = df.sort_values(by='dayssinceinspection', ascending=False).iloc[0]
ax.vlines(first_inspection.dayssinceinspection.days, *ax.get_ylim())
<matplotlib.collections.LineCollection at 0x1158daac8>
# Data Cleaning
df.groupby('name').size().sort_values(ascending=False).head()
name Subway 23 Spar 20 Winemark 18 Mace 15 Wineflair 13 dtype: int64
That's some awful strange looking latitude / longitude
df.head()
addressline1 | addressline2 | addressline3 | addressline4 | name | inspectiondate | latitude | longitude | postcode | rating | dayssinceinspection | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23 | 375525 | 334581 | BT3 9AH | 5.0 | 2114 days | ||
1 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21 | 378213 | 332938 | BT15 5HD | 5.0 | 1325 days | |
2 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19 | 373992 | 334055 | BT1 4NX | 4.0 | 1176 days | ||
3 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03 | 374852 | 333733 | BT1 2GX | 5.0 | 1312 days | ||
4 | 1 Union Street | Maverick | 2014-02-13 | 378435 | 281262 | BT1 2JF | 4.0 | 1575 days |
from pyproj import Proj, transform
prj_wgs = Proj(proj='latlong',datum='WGS84')
prj_itm = Proj(init='EPSG:29903')
def reproject_itm_to_wgs(eastings, northings):
long, lat = transform(prj_itm, prj_wgs, eastings, northings)
return long, lat
reproject_itm_to_wgs(334581,375525)
(-5.917804141875383, 54.61000494684366)
df.rename(columns={'longitude':'eastings','latitude':'northings'}, inplace=True)
df.head()
addressline1 | addressline2 | addressline3 | addressline4 | name | inspectiondate | northings | eastings | postcode | rating | dayssinceinspection | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23 | 375525 | 334581 | BT3 9AH | 5.0 | 2114 days | ||
1 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21 | 378213 | 332938 | BT15 5HD | 5.0 | 1325 days | |
2 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19 | 373992 | 334055 | BT1 4NX | 4.0 | 1176 days | ||
3 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03 | 374852 | 333733 | BT1 2GX | 5.0 | 1312 days | ||
4 | 1 Union Street | Maverick | 2014-02-13 | 378435 | 281262 | BT1 2JF | 4.0 | 1575 days |
def apply_itm_to_wgs(row):
eastings, northings = row['eastings'], row['northings']
if eastings is not None and northings is not None:
long,lat = transform(prj_itm, prj_wgs, eastings, northings)
else:
long,lat = None,None
return pd.Series({'latitude':lat,'longitude':long})
lats_n_longs = df[['eastings','northings']].apply(apply_itm_to_wgs, axis=1)
lats_n_longs.head(10)
latitude | longitude | |
---|---|---|
0 | 54.610005 | -5.917804 |
1 | 54.634570 | -5.941997 |
2 | 54.596383 | -5.926639 |
3 | 54.604188 | -5.931226 |
4 | 54.647520 | -6.741936 |
5 | 54.581714 | -5.947958 |
6 | 54.628103 | -5.939474 |
7 | NaN | NaN |
8 | 54.603692 | -5.931607 |
9 | 54.610187 | -5.963893 |
try:
df = df.join(lats_n_longs)
except ValueError:
# Gets thrown if I accidently run this twice
pass
df.head()
addressline1 | addressline2 | addressline3 | addressline4 | name | inspectiondate | northings | eastings | postcode | rating | dayssinceinspection | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 Corry Place | Belfast | Heyn Group | 2012-08-23 | 375525 | 334581 | BT3 9AH | 5.0 | 2114 days | 54.610005 | -5.917804 | ||
1 | Rosemary Presbyterian Church | 26-36 North Circular Road | Belfast | Rosemary Lunch Club | 2014-10-21 | 378213 | 332938 | BT15 5HD | 5.0 | 1325 days | 54.634570 | -5.941997 | |
2 | 37 Montgomery Street | Belfast | John Ross & Co Auctioneers | 2015-03-19 | 373992 | 334055 | BT1 4NX | 4.0 | 1176 days | 54.596383 | -5.926639 | ||
3 | 108 Donegall Street | Belfast | The Maverick/Boom Box | 2014-11-03 | 374852 | 333733 | BT1 2GX | 5.0 | 1312 days | 54.604188 | -5.931226 | ||
4 | 1 Union Street | Maverick | 2014-02-13 | 378435 | 281262 | BT1 2JF | 4.0 | 1575 days | 54.647520 | -6.741936 |
import geopandas as gp
from shapely.geometry import Point
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
_df = df.drop(['longitude','latitude'], axis=1)
crs = {'init': 'epsg:4326'}
gdf=gp.GeoDataFrame(_df, crs=crs, geometry=geometry)
gdf.plot(figsize=graph_figsize)
<matplotlib.axes._subplots.AxesSubplot at 0x116197b70>
Search for "Shapefile" on OpendataNI (Also in this repo too)
base_map=gp.GeoDataFrame.from_file("OSNI_Open_Data_Largescale_Boundaries__NI_Outline.shp")
base_map.plot(figsize=graph_figsize)
<matplotlib.axes._subplots.AxesSubplot at 0x1162ca4e0>
f,ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating',ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
<matplotlib.axes._subplots.AxesSubplot at 0x115fedb70>
min_max = df[['latitude','longitude']].agg(['min','max'])
min_max
latitude | longitude | |
---|---|---|
min | 54.526609 | -6.741936 |
max | 54.677225 | -5.813576 |
f,ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating',ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
ax.set_xlim(min_max['longitude'].values)
ax.set_ylim(min_max['latitude'].values)
(54.52660878238531, 54.67722468503716)
f,ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating',ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
ax.set_xlim(min_max['longitude'].values)
ax.set_ylim(min_max['latitude'].values)
ax.set_aspect('equal')
percentiles = df[['latitude','longitude']].quantile([0.01,0.99])
percentiles
latitude | longitude | |
---|---|---|
0.01 | 54.550063 | -6.026184 |
0.99 | 54.644132 | -5.840460 |
f,ax = plt.subplots(figsize=graph_figsize)
base_map.plot(ax=ax, color='gray')
gdf.dropna().plot(column='rating',ax=ax, alpha=0.5, legend=True, cmap='RdYlGn')
ax.set_xlim(percentiles['longitude'].values)
ax.set_ylim(percentiles['latitude'].values)
ax.set_aspect('equal')
Bonus Round If there's something you think government has that you want and haven't seen, request it.