#!/usr/bin/env python
# coding: utf-8

# # Search narratives
#
# Most NTSB reports include a narrative written by the investigators.
# This notebook searches them for references to mast bumping.

# In[1]:

import os

import pandas as pd

# Read in the narratives.

# In[2]:

# Restore `input_dir` and `output_dir` from IPython's %store database.
# NOTE(review): they are presumably stored by an earlier notebook in the
# pipeline -- confirm. `input_dir` is restored but not used below.
get_ipython().run_line_magic('store', '-r input_dir')
get_ipython().run_line_magic('store', '-r output_dir')


# In[3]:

def read_df(name):
    """Read the CSV file called `name` from `output_dir` into a DataFrame."""
    return pd.read_csv(os.path.join(output_dir, name))


# In[4]:

narratives = read_df("narratives.csv")

# Join them to fatal U.S. helicopter accidents.
# NOTE(review): only the `in_usa` filter is applied here; the "fatal" part is
# presumably already baked into the standardized input file -- confirm.

# In[5]:

helicopter_by_accident = read_df("standardized-helicopters-by-accident.csv")

# In[6]:

# `== True` is intentional: it builds an element-wise boolean mask and treats
# missing values as False, which plain truthiness indexing would not.
us_helicopter_by_accident = helicopter_by_accident[
    helicopter_by_accident.in_usa == True
]

# In[7]:

# Default inner join: keeps only aircraft present in both frames.
merged = pd.merge(
    us_helicopter_by_accident,
    narratives,
    on=["event_id", "aircraft_id"],
)

# Search them for terms related to mast bumping.

# In[8]:


def search(df, string):
    """
    Searches the provided DataFrame's object-dtype columns for the provided
    string, case-insensitively and as a literal substring (not a regex).

    Returns the matching rows, de-duplicated, as a new DataFrame. If the
    DataFrame has no object-dtype columns, an empty DataFrame with the same
    columns is returned.
    """
    needle = string.lower()
    text_columns = df.dtypes[df.dtypes == 'object'].index
    matches = [
        # regex=False: the docstring promises a string search, so characters
        # such as "." or "(" in the term must not be read as regex syntax.
        # na=False: cells with no text never match.
        df[df[column].str.lower().str.contains(needle, na=False, regex=False)]
        for column in text_columns
    ]
    if not matches:
        # pd.concat raises ValueError on an empty list; an empty result is the
        # natural answer when there is nothing to search.
        return df.iloc[0:0]
    return pd.concat(matches).drop_duplicates()


# In[9]:

# NOTE(review): the trailing space in "mast bump " looks deliberate (matches
# "mast bump" followed by another word) -- confirm before normalizing it.
search_terms = (
    "mast bumping",
    "rocking",
    "vibration",
    "mast bump ",
)

# A row can match several terms (or several columns); keep one copy of each.
hits = pd.concat(
    [search(merged, term) for term in search_terms]
).drop_duplicates()

# In[12]:

len(hits)

# Output the result.

# In[1]:

hits.sort_values("event_id", ascending=True).to_csv(
    os.path.join(output_dir, "searched-narratives.csv"),
    encoding="utf-8",
    index=False,
)

# In[ ]: