#!/usr/bin/env python
# coding: utf-8

# # Search narratives
# 
# Most NTSB reports include a narrative written by the investigators. This notebook searches them for references to mast bumping.

# In[1]:


import os
import pandas as pd


# Read in the narratives.

# In[2]:


# Restore input_dir and output_dir from IPython's %store persistence
# (`-r` = restore). This only works when run under IPython/Jupyter, and the
# values must have been stored beforehand -- presumably by an earlier notebook
# in this project (TODO confirm which one).
get_ipython().run_line_magic('store', '-r input_dir')
get_ipython().run_line_magic('store', '-r output_dir')


# In[3]:


def read_df(name):
    """
    Reads a CSV file from the output directory.

    `name` is the file name, resolved relative to `output_dir`.

    Returns the file's contents as a new DataFrame.
    """
    return pd.read_csv(os.path.join(output_dir, name))


# In[4]:


# Load narratives.csv from output_dir (via read_df).
narratives = read_df("narratives.csv")


# Join them to fatal U.S. helicopter accidents.

# In[5]:


# Load standardized-helicopters-by-accident.csv from output_dir (via read_df).
helicopter_by_accident = read_df("standardized-helicopters-by-accident.csv")


# In[6]:


# Keep only the rows whose in_usa flag equals True.
us_helicopter_by_accident = helicopter_by_accident.loc[
    helicopter_by_accident["in_usa"].eq(True)
]


# In[7]:


# Pair each U.S. helicopter row with its narrative. This is an inner join, so
# rows without a counterpart on the other side are dropped.
merged = us_helicopter_by_accident.merge(
    narratives,
    how="inner",
    on=["event_id", "aircraft_id"],
)


# Search them for terms related to mast bumping.

# In[8]:


def search(df, string):
    """
    Searches the provided DataFrame's text columns for the provided string.

    The match is a case-insensitive, literal substring test: characters that
    are special in regular expressions have no special meaning in `string`.
    Only columns with the `object` dtype are searched, and missing values
    never match.

    Returns the rows that match in at least one column as a new DataFrame,
    with fully duplicated rows removed.
    """
    # Lowercase the search term once instead of once per column.
    needle = string.lower()
    text_columns = df.dtypes[df.dtypes == 'object'].index
    if text_columns.empty:
        # pd.concat raises ValueError when given an empty list, so return an
        # empty frame with the same columns when there is nothing to search.
        return df.iloc[0:0].copy()
    matches = [
        # regex=False: the term is plain text, not a pattern. Treating it as a
        # pattern would misread characters such as "." or "(".
        df[df[c].str.lower().str.contains(needle, regex=False, na=False)]
        for c in text_columns
    ]
    return pd.concat(matches).drop_duplicates()


# In[9]:


# Run one search per term and pool the matches; a row that matches several
# terms is kept only once.
hits = pd.concat(
    [
        search(merged, term)
        for term in ("mast bumping", "rocking", "vibration", "mast bump ")
    ]
).drop_duplicates()


# In[12]:


# Number of matching rows (displayed as the cell's output in the notebook).
len(hits)


# Output the result.

# In[1]:


# Write the matches, ordered by event_id, to searched-narratives.csv in output_dir.
(
    hits
    .sort_values(by="event_id", ascending=True)
    .to_csv(
        os.path.join(output_dir, "searched-narratives.csv"),
        encoding="utf-8",
        index=False,
    )
)


# In[ ]: