# import libraries
import pandas as pd
import numpy as n
from sqlalchemy import create_engine
# load messages dataset
messages = pd.read_csv('data/disaster_messages.csv')
messages.head()
id | message | original | genre | |
---|---|---|---|---|
0 | 2 | Weather update - a cold front from Cuba that c... | Un front froid se retrouve sur Cuba ce matin. ... | direct |
1 | 7 | Is the Hurricane over or is it not over | Cyclone nan fini osinon li pa fini | direct |
2 | 8 | Looking for someone but no name | Patnm, di Maryani relem pou li banm nouvel li ... | direct |
3 | 9 | UN reports Leogane 80-90 destroyed. Only Hospi... | UN reports Leogane 80-90 destroyed. Only Hospi... | direct |
4 | 12 | says: west side of Haiti, rest of the country ... | facade ouest d Haiti et le reste du pays aujou... | direct |
# load categories dataset
categories = pd.read_csv('data/disaster_categories.csv')
categories.head()
id | categories | |
---|---|---|
0 | 2 | related-1;request-0;offer-0;aid_related-0;medi... |
1 | 7 | related-1;request-0;offer-0;aid_related-1;medi... |
2 | 8 | related-1;request-0;offer-0;aid_related-0;medi... |
3 | 9 | related-1;request-1;offer-0;aid_related-1;medi... |
4 | 12 | related-1;request-0;offer-0;aid_related-0;medi... |
df
, which will be cleaned in the following steps# merge datasets
df = pd.merge(messages, categories, on = 'id')
df.head()
id | message | original | genre | categories | |
---|---|---|---|---|---|
0 | 2 | Weather update - a cold front from Cuba that c... | Un front froid se retrouve sur Cuba ce matin. ... | direct | related-1;request-0;offer-0;aid_related-0;medi... |
1 | 7 | Is the Hurricane over or is it not over | Cyclone nan fini osinon li pa fini | direct | related-1;request-0;offer-0;aid_related-1;medi... |
2 | 8 | Looking for someone but no name | Patnm, di Maryani relem pou li banm nouvel li ... | direct | related-1;request-0;offer-0;aid_related-0;medi... |
3 | 9 | UN reports Leogane 80-90 destroyed. Only Hospi... | UN reports Leogane 80-90 destroyed. Only Hospi... | direct | related-1;request-1;offer-0;aid_related-1;medi... |
4 | 12 | says: west side of Haiti, rest of the country ... | facade ouest d Haiti et le reste du pays aujou... | direct | related-1;request-0;offer-0;aid_related-0;medi... |
categories
into separate category columns.¶categories
column on the ;
character so that each value becomes a separate column. You'll find this method very helpful! Make sure to set expand=True
.categories
with new column names.# create a dataframe of the 36 individual category columns
categories = df['categories'].str.split(';', expand=True)
categories.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | related-1 | request-0 | offer-0 | aid_related-0 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
1 | related-1 | request-0 | offer-0 | aid_related-1 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-1 | floods-0 | storm-1 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
2 | related-1 | request-0 | offer-0 | aid_related-0 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
3 | related-1 | request-1 | offer-0 | aid_related-1 | medical_help-0 | medical_products-1 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
4 | related-1 | request-0 | offer-0 | aid_related-0 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
5 rows × 36 columns
# select the first row of the categories dataframe
row = categories.iloc[1]
# use this row to extract a list of new column names for categories.
# one way is to apply a lambda function that takes everything
# up to the second to last character of each string with slicing
category_colnames = row.apply(lambda x: x.split('-')[0])
print(category_colnames)
0 related 1 request 2 offer 3 aid_related 4 medical_help 5 medical_products 6 search_and_rescue 7 security 8 military 9 child_alone 10 water 11 food 12 shelter 13 clothing 14 money 15 missing_people 16 refugees 17 death 18 other_aid 19 infrastructure_related 20 transport 21 buildings 22 electricity 23 tools 24 hospitals 25 shops 26 aid_centers 27 other_infrastructure 28 weather_related 29 floods 30 storm 31 fire 32 earthquake 33 cold 34 other_weather 35 direct_report Name: 1, dtype: object
# rename the columns of `categories`
categories.columns = category_colnames
categories.head()
1 | related | request | offer | aid_related | medical_help | medical_products | search_and_rescue | security | military | child_alone | ... | aid_centers | other_infrastructure | weather_related | floods | storm | fire | earthquake | cold | other_weather | direct_report |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | related-1 | request-0 | offer-0 | aid_related-0 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
1 | related-1 | request-0 | offer-0 | aid_related-1 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-1 | floods-0 | storm-1 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
2 | related-1 | request-0 | offer-0 | aid_related-0 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
3 | related-1 | request-1 | offer-0 | aid_related-1 | medical_help-0 | medical_products-1 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
4 | related-1 | request-0 | offer-0 | aid_related-0 | medical_help-0 | medical_products-0 | search_and_rescue-0 | security-0 | military-0 | child_alone-0 | ... | aid_centers-0 | other_infrastructure-0 | weather_related-0 | floods-0 | storm-0 | fire-0 | earthquake-0 | cold-0 | other_weather-0 | direct_report-0 |
5 rows × 36 columns
related-0
becomes 0
, related-1
becomes 1
. Convert the string to a numeric value..str
after the Series. You may need to first convert the Series to be of type string, which you can do with astype(str)
.for column in categories:
# set each value to be the last character of the string
categories[column] = categories[column].apply(lambda x: int(x.split('-')[1]))
categories.head()
1 | related | request | offer | aid_related | medical_help | medical_products | search_and_rescue | security | military | child_alone | ... | aid_centers | other_infrastructure | weather_related | floods | storm | fire | earthquake | cold | other_weather | direct_report |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 36 columns
categories
column in df
with new category columns.¶# drop the original categories column from `df`
df.drop('categories', axis = 1, inplace = True)
df.head()
id | message | original | genre | |
---|---|---|---|---|
0 | 2 | Weather update - a cold front from Cuba that c... | Un front froid se retrouve sur Cuba ce matin. ... | direct |
1 | 7 | Is the Hurricane over or is it not over | Cyclone nan fini osinon li pa fini | direct |
2 | 8 | Looking for someone but no name | Patnm, di Maryani relem pou li banm nouvel li ... | direct |
3 | 9 | UN reports Leogane 80-90 destroyed. Only Hospi... | UN reports Leogane 80-90 destroyed. Only Hospi... | direct |
4 | 12 | says: west side of Haiti, rest of the country ... | facade ouest d Haiti et le reste du pays aujou... | direct |
# concatenate the original dataframe with the new `categories` dataframe
df = df.join(categories)
df.head()
id | message | original | genre | related | request | offer | aid_related | medical_help | medical_products | ... | aid_centers | other_infrastructure | weather_related | floods | storm | fire | earthquake | cold | other_weather | direct_report | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | Weather update - a cold front from Cuba that c... | Un front froid se retrouve sur Cuba ce matin. ... | direct | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 7 | Is the Hurricane over or is it not over | Cyclone nan fini osinon li pa fini | direct | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 8 | Looking for someone but no name | Patnm, di Maryani relem pou li banm nouvel li ... | direct | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 9 | UN reports Leogane 80-90 destroyed. Only Hospi... | UN reports Leogane 80-90 destroyed. Only Hospi... | direct | 1 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 12 | says: west side of Haiti, rest of the country ... | facade ouest d Haiti et le reste du pays aujou... | direct | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 40 columns
# check number of duplicates
sum(df.duplicated())
170
# drop duplicates
df = df.drop_duplicates()
# check number of duplicates
sum(df.duplicated())
0
You can do this with pandas to_sql
method combined with the SQLAlchemy library. Remember to import SQLAlchemy's create_engine
in the first cell of this notebook to use it below.
engine = create_engine('sqlite:///DisasterResponse.db')
df.to_sql('DisasterResponse', engine, index=False)
etl_pipeline.py
¶Use the template file attached in the Resources folder to write a script that runs the steps above to create a database based on new datasets specified by the user. Alternatively, you can complete etl_pipeline.py
in the classroom on the Project Workspace IDE
coming later.