#!/usr/bin/env python
# coding: utf-8

# # Houston flood zone analysis
# 
# By [Ben Welsh](http://www.latimes.com/la-bio-ben-welsh-staff.html)
# 
# A geospatial analysis of Houston property records conducted for the Nov. 8, 2017 Los Angeles Times story ["How Houston’s newest homes survived Hurricane Harvey"](http://www.latimes.com/projects/la-na-houston-harvey-home-survivors/). 
# 
# Nearly a quarter of new Houston homes in the past decade were built within a federal flood zone. To look at damage there, the Times chose five neighborhoods built inside a flood zone over the last decade and asked a sample of homeowners in each area how they fared. In case after case, owners of the newer homes rode out the hurricane safely.
# 
# The brunt of the damage appears to be borne by older houses — sometimes just doors away from newer, unscathed real estate — that predate federal and local anti-flooding regulations and account for 79% of the nearly 129,000 houses located in the federal flood zones.
# 
# To learn more about the survey, the context and our findings, [read the story](http://www.latimes.com/projects/la-na-houston-harvey-home-survivors/). Below you can find the Python code that analyzed and where, and when, homes were built inside federal flood zones.

# ### Download the source data

# Our sources were:
# 
# 1. Flood zone maps from the Federal Emergency Management Agency
# 2. Property records and parcel maps from the Harris County Appraisal District

# In[ ]:


get_ipython().run_line_magic('run', './01_download.ipynb')


# Some of the source files must be downloaded and unpackaged manually. To recreate the analysis you should refer to the documentation in the [download notebook](01_download.ipynb) and unpack them yourself.

# ### Transform the source data

# [The next notebook](02_transform.ipynb) converts the source data files into comma-delimited format, standardizes geospatial reference systems, computes additional columns, filters out superfluous records and trims off unneeded metadata.

# In[ ]:


get_ipython().run_line_magic('run', './02_transform.ipynb')


# ### Merge the source data

# [This notebook](03_merge.ipynb) mints files ready for analysis for merging the buildings data with parcel maps

# In[ ]:


get_ipython().run_line_magic('run', './03_merge.ipynb')


# ### Spatially join buildings data with flood zones 

# Now we begin the analysis by importing our Python tools.

# In[1]:


import os
import geopandas as gpd
import matplotlib.pyplot as plt


# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')


# Prepare utilities for opening files

# In[3]:


get_output_path = lambda x: os.path.join("./output", x)


# In[4]:


get_gdf = lambda x: gpd.read_file(get_output_path(x))


# Because the source data files are so large, a single spatial join is difficult to execute. To work around the limitations of our available hardware, the following code splits up the shapefiles into chunks based on the year each building was built.

# In[5]:


year_list = gpd.pd.np.arange(1800, 2017)


# In[6]:


def split_year(df_name, func):
    """
    Split the provided dataframe into separate files using all the years in our dataset.
    """
    # Open the dataframe
    df = get_gdf(df_name)
    
    # Loop through all the years we're watching
    for year in year_list:
        
        # For each year pass the dataframe to the provided function (for filtering or analysis)
        year_df = func(df, year)
       
        # Write out the results
        if len(year_df) > 0:
            year_name = "{}-{}".format(year, df_name)
            year_path = get_output_path(year_name)
            year_df.to_file(year_path)
    
    # Pass back out the dataframe since it's already open. We might reuse it.
    return df


# The master shapefile is read in and split into many pieces.

# In[28]:


buildings = split_year("buildings-and-parcels.shp", lambda x, y: x[x.erected == y])


# Each year's file is then spatial joined with the flood zones and output into a file with the matches.

# In[7]:


def read_year(year, name_template="{}-buildings-and-parcels.shp"):
    """
    Returns the split slice of residential parcels for the given year.
    """
    try:
        # Go grab the file for this year
        return get_gdf(name_template.format(year))
    except IOError:
        # If there's no file, just return None
        return None


# In[8]:


def sjoin_year(parcels, flood_zones):
    """
    Conducts a spatial join between the provided parcels and flood zones.
    
    Returns parcels that intersect with the zones.
    """
    if isinstance(parcels, type(None)):
        return gpd.GeoDataFrame()
    sjoin = gpd.sjoin(parcels, flood_zones, how="inner", op='intersects')
    return parcels[parcels.account.isin(sjoin.account)]


# In[9]:


zones_500 = split_year("five-hundred-year-flood-zones.shp", lambda shp, y: sjoin_year(read_year(y), shp))
zones_100 = split_year("one-hundred-year-flood-zones.shp", lambda shp, y: sjoin_year(read_year(y), shp))


# All the matches are read in and combined into a single shapefile with all of the buildings found within a flood zone.

# In[10]:


def concat_years(df_name):
    """
    Combine all files split by year with the provided pattern into a single dataframe.
    """
    # Pull all the annual files
    df_list = [
        read_year(year, "{}-%s" % df_name)
            for year in year_list
    ]
    
    # Lump them together into one big dataframe 
    df = gpd.pd.concat([i for i in df_list if isinstance(i, gpd.GeoDataFrame)])
    
    # Convert the dataframe into a geodataframe
    geometry = df['geometry']
    df = df.drop('geometry', axis=1)
    crs = {'init': 'epsg:4269'}
    gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
    
    # Return the result
    return gdf


# In[11]:


in_zone_500 = concat_years("five-hundred-year-flood-zones.shp")
in_zone_100 = concat_years("one-hundred-year-flood-zones.shp")


# Some buildings intersect partially with both a 100 year and 500 year flood zone. We remove the duplicates using a conservative methodology that places buildings in both within the less risky 500 year zone.

# In[12]:


in_zone_100_only = in_zone_100[~(in_zone_100.account.isin(in_zone_500.account))]


# Create a combined dataframe with both types after the duplicates have been excised.

# In[13]:


in_any_zone = gpd.pd.concat([in_zone_500, in_zone_100_only])


# Reset indexes to tidy up the data.

# In[18]:


in_zone_500 = in_zone_500.reset_index()
in_zone_100 = in_zone_100.reset_index()
in_zone_100_only = in_zone_100_only.reset_index()
in_any_zone = in_any_zone.reset_index()


# Write out shapefiles of the results.

# In[19]:


in_zone_500.to_file(get_output_path("./buildings-in-five-hundred-year-flood-zones.shp"))
in_zone_100_only.to_file(get_output_path("./buildings-in-one-hundred-year-flood-zones.shp"))
in_any_zone.to_file(get_output_path("./buildings-in-any-flood-zone.shp"))


# Filter down to just single-family homes, as instructed by officials at the Harris County Appraisal District.

# In[20]:


homes_in_zone_500 = in_zone_500[in_zone_500.use == 'A1']
homes_in_zone_100_only = in_zone_100_only[in_zone_100_only.use == 'A1']
homes_in_any_zone = in_any_zone[in_any_zone.use == 'A1']


# Write out shapefiles of those.

# In[21]:


homes_in_zone_500.to_file(get_output_path("./homes-in-five-hundred-year-flood-zones.shp"))
homes_in_zone_100_only.to_file(get_output_path("./homes-in-one-hundred-year-flood-zones.shp"))
homes_in_any_zone.to_file(get_output_path("./homes-in-any-flood-zone.shp"))


# Filter down again to just homes that fall within the Houston city limits, as instructed by officials at the Harris County Appraisal District.

# In[22]:


houston_homes_in_zone_500 = homes_in_zone_500[homes_in_zone_500.houston == 'True']
houston_homes_in_zone_100_only = homes_in_zone_100_only[homes_in_zone_100_only.houston == 'True']
houston_homes_in_any_zone = homes_in_any_zone[homes_in_any_zone.houston == 'True']


# Write out shapefiles of those.

# In[23]:


houston_homes_in_zone_500.to_file(get_output_path("./houston-homes-in-five-hundred-year-flood-zones.shp"))
houston_homes_in_zone_100_only.to_file(get_output_path("./houston-homes-in-one-hundred-year-flood-zones.shp"))
houston_homes_in_any_zone.to_file(get_output_path("./houston-homes-in-any-flood-zone.shp"))


# Clear out split files generated above.

# In[ ]:


get_ipython().system('rm ./output/18*-*')
get_ipython().system('rm ./output/19*-*')
get_ipython().system('rm ./output/20*-*')


# ### Calculate annual totals

# Prepare functions for running the numbers.

# In[24]:


def groupby_year(df, value_name):
    """
    Returns a regrouped dataframe using the 'erected' field that contains year of construction.
    """
    grouped = df.groupby("erected").size().reset_index()
    grouped.columns = ['year', value_name]
    return grouped


# In[25]:


def crosstab(universe_df, in_zone_500_df, in_zone_100_df, func=groupby_year, on="year"):
    """
    Returns a dataframe crosstab with counts and percentages of buildings constructed within flood zones.
    """
    # Group all three sets by year
    universe_by_func = func(universe_df, "total")
    in_zone_500_by_func = func(in_zone_500_df, "in_zone_500")
    in_zone_100_by_func = func(in_zone_100_df, "in_zone_100")

    # Create a crosstab
    crosstab = gpd.pd.merge(
        universe_by_func,
        gpd.pd.merge(in_zone_100_by_func, in_zone_500_by_func, how="outer", on=on),
        how="left",
        on=on
    )
    
    # Clean it up
    crosstab.fillna(0, inplace=True)
    
    # Add total and percentage columns
    crosstab['in_zone'] = crosstab['in_zone_100'] + crosstab['in_zone_500']
    crosstab['in_zone_percent'] = crosstab['in_zone'] / crosstab['total']
    
    # Trim it
    trimmed = crosstab[((crosstab[on] > 1500) & (crosstab[on] < 2017))]
    
    # Pass it back
    return trimmed


# Run the totals for all homes in Harris County

# In[31]:


homes = buildings[buildings.use == 'A1']
homes_by_year = crosstab(homes, homes_in_zone_500, homes_in_zone_100_only)


# Run the totals for homes in the Houston city limits.

# In[32]:


houston_homes = homes[homes.houston == 'True']
houston_homes_by_year = crosstab(houston_homes, houston_homes_in_zone_500, houston_homes_in_zone_100_only)


# ### Interview the data

# In[33]:


def compare(numerator, denominator):
    print "{:,} / {:,} ({:.2%})".format(numerator, denominator, numerator / denominator)


# #### What's the total number of homes in flood zones?

# Overall

# In[34]:


compare(homes_by_year['in_zone'].sum(), homes_by_year['total'].sum())
compare(houston_homes_by_year['in_zone'].sum(), houston_homes_by_year['total'].sum())


# 100 year zone

# In[35]:


compare(homes_by_year['in_zone_100'].sum(), homes_by_year['total'].sum())
compare(houston_homes_by_year['in_zone_100'].sum(), houston_homes_by_year['total'].sum())


# 500 year zone

# In[36]:


compare(homes_by_year['in_zone_500'].sum(), homes_by_year['total'].sum())
compare(houston_homes_by_year['in_zone_500'].sum(), houston_homes_by_year['total'].sum())


# #### What are those same numbers over the past 10 years?

# In[37]:


homes_since_2007 = homes_by_year[homes_by_year.year >= 2007]
houston_homes_since_2007 = houston_homes_by_year[houston_homes_by_year.year >= 2007]


# In[38]:


compare(homes_since_2007['in_zone'].sum(), homes_since_2007['total'].sum())
compare(houston_homes_since_2007['in_zone'].sum(), houston_homes_since_2007['total'].sum())


# In[39]:


compare(homes_since_2007['in_zone_100'].sum(), homes_since_2007['total'].sum())
compare(houston_homes_since_2007['in_zone_100'].sum(), houston_homes_since_2007['total'].sum())


# In[40]:


compare(homes_since_2007['in_zone_500'].sum(), homes_since_2007['total'].sum())
compare(houston_homes_since_2007['in_zone_500'].sum(), houston_homes_since_2007['total'].sum())


# #### What share of Houston homes in flood zones were built before 1985?
# 
# That's the year when Houston's local ordinances regulating flood zone elevations went into effect.

# In[41]:


homes_before_rules = homes_by_year[homes_by_year.year < 1985]
houston_homes_before_rules = houston_homes_by_year[houston_homes_by_year.year < 1985]


# In[42]:


compare(homes_before_rules['in_zone'].sum(), homes_by_year['in_zone'].sum())
compare(houston_homes_before_rules['in_zone'].sum(), houston_homes_by_year['in_zone'].sum())


# In[43]:


compare(homes_before_rules['in_zone_100'].sum(), homes_by_year['in_zone_100'].sum())
compare(houston_homes_before_rules['in_zone_100'].sum(), houston_homes_by_year['in_zone_100'].sum())


# In[44]:


compare(homes_before_rules['in_zone_500'].sum(), homes_by_year['in_zone_500'].sum())
compare(houston_homes_before_rules['in_zone_500'].sum(), houston_homes_by_year['in_zone_500'].sum())


# ### Chart the trend over time

# Filter down the data to the time range we want to chart.

# In[45]:


start_year = 1930


# Total buildings countywide by year

# In[46]:


chart_df = homes_by_year[(homes_by_year.year > start_year)]


# In[47]:


fig = plt.figure(figsize=(20, 10))
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), chart_df.total)
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), chart_df.in_zone)


# Total buildings in Houston by year

# In[48]:


houston_chart_df = houston_homes_by_year[(houston_homes_by_year.year > start_year)]


# In[49]:


fig = plt.figure(figsize=(20, 10))
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), houston_chart_df.total)
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), houston_chart_df.in_zone)


# Percentage of buildings in flood zones countywide by year

# In[50]:


fig = plt.figure(figsize=(20, 10))
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), [1 for i in gpd.pd.np.arange(start_year, 2016)])
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), chart_df.in_zone_percent)


# Percentage of Houston buildings in flood zones by year

# In[51]:


fig = plt.figure(figsize=(20, 10))
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), [1 for i in gpd.pd.np.arange(start_year, 2016)])
ax = plt.bar(gpd.pd.np.arange(start_year, 2016), houston_chart_df.in_zone_percent)


# ### Write out the data for graphics

# Ultimately we smoothed data before 1975 to five-year averages due to imprecisions described by officials at the Harris County Appraisal District.

# In[52]:


homes_by_year.to_csv("./output/homes-by-year.csv")


# In[53]:


houston_homes_by_year.to_csv("./output/houston-homes-by-year.csv")