#!/usr/bin/env python
# coding: utf-8

# # Exploratory Data Analysis of the West Nile Virus Dataset

# In[1]:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # show the value of every top-level statement in a cell, not just the last one

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# ### Importing the data

# In[2]:

weather = pd.read_csv('weather.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
spray = pd.read_csv('spray.csv')
#sample = pd.read_csv('sampleSubmission.csv')

# ### Data dictionary
#
# #### train.csv and test.csv
#
# - Id: the id of the record
# - Date: date that the WNV test is performed
# - Address: approximate address of the trap location; this is what is sent to the GeoCoder
# - Species: the species of mosquitos
# - Block: block number of the address
# - Street: street name
# - Trap: id of the trap
# - AddressNumberAndStreet: approximate address returned by the GeoCoder
# - Latitude, Longitude: latitude and longitude returned by the GeoCoder
# - AddressAccuracy: accuracy returned by the GeoCoder
# - NumMosquitos: number of mosquitoes caught in this trap
# - WnvPresent: whether West Nile Virus was present in these mosquitos; 1 means WNV is present, 0 means not present
#
# #### spray.csv
#
# - Date, Time: the date and time of the spray
# - Latitude, Longitude: the latitude and longitude of the spray
#
# #### weather.csv
#
# - Column descriptions are in noaa_weather_qclcd_documentation.pdf.

# ### Questions from the project instructions:
#
# 1. Describe the data
# 2. What does it represent?
# 3. What types are present?
# 4. What does each data point's distribution look like?
# 5. What kind of cleaning is needed?
# 6. Document any potential issues that will need to be resolved

# ### We first define a function to perform some steps of the EDA:

# In[3]:

def eda(df):
    print("1) Are there missing values?")
    if df.isnull().any().any():
        print("Yes | Percentage of missing values in each column:\n",
              df.isnull().sum() / df.shape[0], '\n')
    else:
        print('No\n')
    print("2) Which are the data types:\n")
    print(df.dtypes, '\n')
    print("3) DataFrame shape:", df.shape)
    print("4) Unique values per column:")
    for col in df.columns.tolist():
        print(col, ":", df[col].nunique())
    print("5) Removing duplicates")
    print('Initial shape:', df.shape)
    # show the most frequent rows; counts above 1 are duplicates
    print(df.groupby(df.columns.tolist()).size().reset_index()
            .rename(columns={0: 'count'}).sort_values('count', ascending=False).head())
    df.drop_duplicates(inplace=True)
    print('Shape after removing duplicates:', df.shape)

# ## 1) Looking at the `train` `DataFrame`:

# In[4]:

train = pd.read_csv('train.csv')
train.head(2)
print("Number of duplicates:", train.duplicated().sum())

# ### Features to be removed
# - The address-related features are redundant; some of them can be removed.
# - `NumMosquitos` and `WnvPresent` are not in the test set (see the quick check below). I will drop the first and keep the second, which is the target, since the number of mosquitoes is less relevant than whether West Nile Virus was present in them.
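# A quick check confirms which columns differ between the two sets. This is a minimal sketch using the raw `train` and `test` frames loaded above; the expectations in the comments are assumptions to be verified against the output.

# In[ ]:

# Columns present in train but absent from test (expected: NumMosquitos, WnvPresent),
# and columns present in test but absent from train (expected: Id).
print('Only in train:', sorted(set(train.columns) - set(test.columns)))
print('Only in test: ', sorted(set(test.columns) - set(train.columns)))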
# In[5]:

print(train.columns.tolist())

# In[6]:

cols_to_keep = ['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'WnvPresent']
train = train[cols_to_keep]
train.head()

# #### There are many duplicates

# In[7]:

train.shape
train[train.duplicated(keep=False)].head()  # examples of duplicates
train[train.duplicated(keep=False)].shape

# We now call `eda()` with the argument `df` equal to the training set `train`; among other things, it removes duplicates:

# In[8]:

eda(train)

# In[9]:

print("Number of duplicates:", train.duplicated().sum())  # sanity check

# ### Comments:
# - There are many duplicates, which were removed by the function `eda()`.
# - Only `Species` can be transformed into dummies; the other categorical columns have too many unique values.
# - We should examine the categorical columns to see if they are unbalanced:
#     - Using `value_counts` we find that the `WnvPresent` column is highly unbalanced, with $\approx 95\%$ zeros.

# In[10]:

round(100 * train['WnvPresent'].value_counts() / train.shape[0], 0)

# In[11]:

train.to_csv('train_new.csv')

# ### Creating dummies from `Species`

# In[12]:

train = pd.read_csv('train_new.csv', index_col=0)
train['Date'] = pd.to_datetime(train['Date'])
train = pd.concat([train, pd.get_dummies(train['Species'], drop_first=True)], axis=1)
train.drop('Species', inplace=True, axis=1)
train.head(2)
train.dtypes

# ### Build a `DataFrame` with the dates broken into pieces

# In[13]:

train2 = train.copy()
train2['Year'] = train2.Date.dt.year
train2['DayofYear'] = train2.Date.dt.dayofyear
train2.drop('Date', inplace=True, axis=1)
train2.head()

# ### Exporting `train` and `train2` after EDA
#
# The `DataFrame` with the full `Date` is kept because it may be useful for merging different dataframes. Hence:
# - `'train_after_eda.csv'` has a `Date` column
# - `'train_after_eda_without_date.csv'` has no `Date` column, but columns `Year` and `DayofYear`

# In[14]:

train.to_csv('train_after_eda.csv')
train2.to_csv('train_after_eda_without_date.csv')

# ## 2) Applying similar changes to the test data

# In[15]:

cols_to_keep_test = ['Date', 'Species', 'Trap', 'Latitude', 'Longitude']

# In[16]:

test = pd.read_csv('test.csv')
test['Date'] = pd.to_datetime(test['Date'])
test = test[cols_to_keep_test]
test.head()
print("Number of duplicates:", test.duplicated().sum())

# In[17]:

test = pd.concat([test, pd.get_dummies(test['Species'], drop_first=True)], axis=1)
test.drop('Species', inplace=True, axis=1)
test.head()

# Note: if `test` contains `Species` values that never appear in `train` (or vice versa), the dummy columns of the two sets will not line up; this should be checked before modeling.

# ### Build a `DataFrame` with the dates broken into pieces

# In[18]:

test2 = test.copy()
test2['Year'] = test2.Date.dt.year
test2['DayofYear'] = test2.Date.dt.dayofyear
test2.drop('Date', inplace=True, axis=1)
test2.head()

# ### Exporting `test` and `test2` after EDA

# In[19]:

test.to_csv('test_after_eda.csv')
test2.to_csv('test_after_eda_without_date.csv')

# ## 3) Now, we look at the `spray` data and perform similar steps:

# In[20]:

spray = pd.read_csv('spray.csv')
spray['Date'] = pd.to_datetime(spray['Date'])
spray.head()
spray.dtypes

# ### Print out duplicates

# In[21]:

spray[spray.duplicated(keep=False)].head()

# In[22]:

eda(spray)

# ### Sanity check

# In[23]:

spray[spray.duplicated(keep=False)]

# Indeed, printing out the `DataFrame` we see that there are several `NaN`s, but the percentage is low.

# In[24]:

spray.isnull().sum()
spray[spray['Time'].isnull()].head()
print('% of NaNs in the `Time` column:', round(100 * spray[spray['Time'].isnull()].shape[0] / spray.shape[0], 2))
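# Before deciding what to do with the missing values, it may help to see how they are distributed across spray dates. This is a minimal sketch using the `spray` frame above; the interpretation in the comment is an assumption to be checked against the output.

# In[ ]:

# Count missing Time entries per spray date. If they cluster on a single date,
# dropping those rows would discard most of one spray event.
print(spray.loc[spray['Time'].isnull(), 'Date'].dt.date.value_counts())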
# #### We can either drop the `NaN`s or remove the `Time` column altogether.
# The second option seems to make more sense, since time does not look like a relevant variable.

# In[25]:

#spray.dropna(inplace=True)
spray.drop('Time', inplace=True, axis=1)
spray.head()

# In[26]:

spray.isnull().sum()  # sanity check

# In[27]:

spray2 = spray.copy()
spray2['Year'] = spray2.Date.dt.year
spray2['DayofYear'] = spray2.Date.dt.dayofyear
spray2.drop('Date', inplace=True, axis=1)
spray2.head()

# In[28]:

spray.to_csv('spray_after_eda.csv')
spray2.to_csv('spray_after_eda_without_date.csv')

# ## 4) Looking at the `weather` `DataFrame`:

# In[29]:

weather = pd.read_csv('weather.csv')

# #### Print a `DataFrame` of duplicated rows and its shape

# In[30]:

weather[weather.duplicated(keep=False)].head(2)
weather[weather.duplicated(keep=False)].shape[0]

# In[31]:

eda(weather)

# The `Water1` column has just one value, namely `M`, which means missing. We remove this column.

# In[32]:

weather['Water1'].value_counts()
weather['Water1'].nunique()
weather['Water1'].unique()
weather.drop('Water1', inplace=True, axis=1)

# The `Depth` column has just two values, namely 0 and `M`, and the latter means missing. We remove this column.

# In[33]:

weather['Depth'].value_counts()
weather['Depth'].nunique()
weather['Depth'].unique()
weather.drop('Depth', inplace=True, axis=1)

# #### Converting dates into `datetime`:

# In[34]:

weather['Date'] = pd.to_datetime(weather['Date'])

# In[35]:

weather.to_csv('weather_new.csv')

# ### Concerning stations 1 and 2
#
# - As we saw above, there are two values of `Station`, namely 1 and 2.
#
# #### From Kaggle's website, on the weather data:
#
# - Hot and dry conditions are more favorable for West Nile virus than cold and wet.
# - We provide you with the dataset from NOAA of the weather conditions of 2007 to 2014, during the months of the tests.
#
# - Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# - Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
#
# - Each date has two records, one for `Station=1` and another for `Station=2`. However, as we shall see, most missing values are in the latter, which we will drop.

# In[36]:

weather['Station'].value_counts()
weather['Station'].unique()

# #### The `for` loop below searches each column for data that cannot be converted to numbers:

# In[37]:

cols_to_keep = ['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat',
                'Cool', 'Sunrise', 'Sunset', 'SnowFall',
                'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed']

# In[38]:

print('Columns with non-convertibles:\n')
for station in [1, 2]:
    print('Station', station, '\n')
    weather_station = weather[weather['Station'] == station]
    for col in weather_station[cols_to_keep]:
        for x in sorted(weather_station[col].unique()):
            try:
                x = float(x)
            except ValueError:
                print(col, '| Non-convertible value and its frequency:',
                      (x, weather_station[weather_station[col] == x][col].count()))
    print("")

# In[39]:

weather.to_csv('weather_new_2.csv')
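# The same check can be written more compactly with `pd.to_numeric`. This is a minimal sketch assuming the `weather` frame and `cols_to_keep` list defined above; `subset` and `bad` are illustrative names.

# In[ ]:

# For each station, count the entries per column that fail numeric conversion:
# pd.to_numeric(..., errors='coerce') turns non-convertible strings into NaN
# (any pre-existing NaNs would also be counted).
for station in [1, 2]:
    subset = weather[weather['Station'] == station]
    bad = subset[cols_to_keep].apply(lambda s: pd.to_numeric(s, errors='coerce').isnull().sum())
    print('Station', station, '- non-convertible entries per column:')
    print(bad[bad > 0], '\n')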
# #### Indeed, as stated above, most missing values are in station 2.
# We will therefore drop the rows with `Station=2`.

# In[40]:

weather = weather[weather['Station'] == 1]
weather.dtypes
weather['Station'].unique()
del weather['Station']

# #### For station 1 only, we have:

# In[41]:

print('Columns with non-convertibles:\n')
for col in weather[cols_to_keep]:
    for x in sorted(weather[col].unique()):
        try:
            x = float(x)
        except ValueError:
            print(col, '| Non-convertible value and its frequency:',
                  (x, weather[weather[col] == x][col].count()))

# ### The strings 'T' and 'M' stand for trace and missing data. A trace is defined to be an amount smaller than 0.05. The following cells take care of both:

# In[42]:

cols_with_M = ['WetBulb', 'StnPressure', 'SeaLevel']
for col in cols_with_M:
    weather[col] = weather[col].str.strip()
    weather[col] = weather[col].str.replace('M', '0.0').astype(float)  # missing values are imputed as 0.0

cols_with_T = ['SnowFall', 'PrecipTotal']
for col in cols_with_T:
    # trace values like '  T' become ' 0.05', which still converts cleanly to float
    weather[col] = weather[col].str.replace(' T', '0.05').astype(float)

for col in cols_to_keep:
    weather[col] = weather[col].astype(float)

# In[43]:

weather.to_csv('weather_new_4.csv')

# ### There are many zeros in the data
#
# In particular, in the columns
#
#     cols_zeros = ['Heat','Cool','SnowFall']
#
# there is a substantial proportion of zeros. We will drop these columns.

# In[44]:

weather = pd.read_csv('weather_new_4.csv', index_col=0)
weather['Date'] = pd.to_datetime(weather['Date'])

# In[45]:

cols_zeros = ['Heat', 'Cool', 'SnowFall']
for col in cols_zeros:
    # fraction of rows equal to zero in each of these columns
    print('{}'.format(col), weather[weather[col] == 0.0][col].value_counts() / weather.shape[0])

# In[46]:

for col in cols_zeros:
    weather.drop(col, inplace=True, axis=1)
weather.head()

# In[47]:

weather.to_csv('weather_new_5.csv')

# In[48]:

weather = pd.read_csv('weather_new_5.csv', index_col=0)
weather['Date'] = pd.to_datetime(weather['Date'])

# ### `CodeSum`
#
# Non-empty `CodeSum` entries are letter codes indicating a significant weather event, so we can binarize the column. We use the regex `'^\w'`, which matches any string whose first character is a word character (a letter, digit, or underscore); blank entries do not match.

# In[49]:

weather['CodeSum'] = weather['CodeSum'].str.strip()  # strip surrounding spaces
weather.loc[weather['CodeSum'].str.contains(r'^\w'), 'CodeSum'] = '1'  # some weather event
weather.loc[weather['CodeSum'] != '1', 'CodeSum'] = '0'               # no weather event

# In[50]:

weather['CodeSum'] = weather['CodeSum'].astype(int)
weather.dtypes

# ### Sunset and sunrise are obviously correlated, so we keep only one of them.

# In[51]:

weather.drop('Sunrise', inplace=True, axis=1)

# ## 5) Quick stop: the `DataFrames` now

# In[52]:

train.isnull().any()
test.isnull().any()
spray.isnull().any()
weather.isnull().any()

# In[53]:

train.head(2)
train2.head(2)
test.head(2)
test2.head(2)
spray.head(2)
weather.head(2)

# In[54]:

train.dtypes
train2.dtypes
test.dtypes
test2.dtypes
weather.dtypes
spray.dtypes

# ## 6) Correlations and feature engineering

# ### Train data

# In[55]:

for df in [train, test, spray, weather]:
    print(df.corr(), '\n')  # inside a loop the result must be printed explicitly to be displayed

# ### The temperatures are highly correlated, and so are some other features. Let's remove the extra baggage.

# In[56]:

weather.drop('Tmax', inplace=True, axis=1)
weather.drop('Tmin', inplace=True, axis=1)
weather.corr()

# In[57]:

weather.drop('WetBulb', inplace=True, axis=1)
weather.drop('DewPoint', inplace=True, axis=1)
weather.corr()

# In[58]:

sns.heatmap(weather.corr())

# In[59]:

sns.heatmap(test.corr())

# In[60]:

sns.heatmap(spray.corr())
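# As a next step, the cleaned frames can be merged on `Date` to build a single modeling table, which is why the `Date` column was kept. This is a minimal sketch; `model_df` is an illustrative name, not part of the notebook.

# In[ ]:

# Left join keeps every trap observation and attaches that day's station-1 weather.
model_df = train.merge(weather, on='Date', how='left')
print(model_df.shape)
model_df.head(2)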