from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # see the value of multiple statements at once.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
weather = pd.read_csv('weather.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
spray = pd.read_csv('spray.csv')
#sample = pd.read_csv('sampleSubmission.csv')
def eda(df):
print("1) Are there missing values:")
if df.isnull().any().unique().shape[0] == 2:
if df.isnull().any().unique()[0] == False and df.isnull().any().unique()[1] == False:
print('No\n')
else:
print("Yes|Percentage of missing values in each column:\n",df.isnull().sum()/df.shape[0],'\n')
elif df.isnull().any().unique().shape[0] == 1:
if df.isnull().any().unique() == False:
print('No\n')
else:
print("Yes|Percentage of missing values in each column:\n",df.isnull().sum()/df.shape[0],'\n')
print("2) Which are the data types:\n")
print(df.dtypes,'\n')
print("3) Dataframe shape:",df.shape)
print("4) Unique values per columm")
for col in df.columns.tolist():
print (col,":",df[col].nunique())
print("5) Removing duplicates")
print('Initial shape:',df.shape)
df.groupby(df.columns.tolist()).size().reset_index().rename(columns={0:'count'}).sort_values('count',ascending=False).head()
df.drop_duplicates(inplace=True)
print('Shape after removing duplicates:',df.shape)
return
train
DataFrame
:¶train = pd.read_csv('train.csv')
train.head(2)
print("Number of duplicates:",train.duplicated().sum())
Date | Address | Species | Block | Street | Trap | AddressNumberAndStreet | Latitude | Longitude | AddressAccuracy | NumMosquitos | WnvPresent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007-05-29 | 4100 North Oak Park Avenue, Chicago, IL 60634,... | CULEX PIPIENS/RESTUANS | 41 | N OAK PARK AVE | T002 | 4100 N OAK PARK AVE, Chicago, IL | 41.95469 | -87.800991 | 9 | 1 | 0 |
1 | 2007-05-29 | 4100 North Oak Park Avenue, Chicago, IL 60634,... | CULEX RESTUANS | 41 | N OAK PARK AVE | T002 | 4100 N OAK PARK AVE, Chicago, IL | 41.95469 | -87.800991 | 9 | 1 | 0 |
Number of duplicates: 813
NumMosquitos
and WnvPresent
are not in the test set. I will remove the first since the number of mosquitos is less relevant than whether West Nile Virus was present in these mosquitos.print(train.columns.tolist())
['Date', 'Address', 'Species', 'Block', 'Street', 'Trap', 'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy', 'NumMosquitos', 'WnvPresent']
cols_to_keep = ['Date', 'Species', 'Trap','Latitude', 'Longitude', 'WnvPresent']
train = train[cols_to_keep]
train.head()
Date | Species | Trap | Latitude | Longitude | WnvPresent | |
---|---|---|---|---|---|---|
0 | 2007-05-29 | CULEX PIPIENS/RESTUANS | T002 | 41.954690 | -87.800991 | 0 |
1 | 2007-05-29 | CULEX RESTUANS | T002 | 41.954690 | -87.800991 | 0 |
2 | 2007-05-29 | CULEX RESTUANS | T007 | 41.994991 | -87.769279 | 0 |
3 | 2007-05-29 | CULEX PIPIENS/RESTUANS | T015 | 41.974089 | -87.824812 | 0 |
4 | 2007-05-29 | CULEX RESTUANS | T015 | 41.974089 | -87.824812 | 0 |
train.shape
train[train.duplicated(keep=False)].head() # examples of duplicates
train[train.duplicated(keep=False)].shape
(10506, 6)
Date | Species | Trap | Latitude | Longitude | WnvPresent | |
---|---|---|---|---|---|---|
98 | 2007-06-26 | CULEX PIPIENS/RESTUANS | T086 | 41.688324 | -87.676709 | 0 |
99 | 2007-06-26 | CULEX PIPIENS/RESTUANS | T086 | 41.688324 | -87.676709 | 0 |
280 | 2007-07-11 | CULEX RESTUANS | T015 | 41.974089 | -87.824812 | 0 |
281 | 2007-07-11 | CULEX RESTUANS | T015 | 41.974089 | -87.824812 | 0 |
293 | 2007-07-11 | CULEX PIPIENS/RESTUANS | T086 | 41.688324 | -87.676709 | 0 |
(2627, 6)
Using the argument df
equal to the training set train
. It removes duplicates among other things:
eda(train)
1) Are there missing values: No 2) Which are the data types: Date object Species object Trap object Latitude float64 Longitude float64 WnvPresent int64 dtype: object 3) Dataframe shape: (10506, 6) 4) Unique values per columm Date : 95 Species : 7 Trap : 136 Latitude : 138 Longitude : 138 WnvPresent : 2 5) Removing duplicates Initial shape: (10506, 6) Shape after removing duplicates: (8610, 6)
print("Number of duplicates:",train.duplicated().sum()) # Sanity check
Number of duplicates: 0
eda( )
Species
can be transformed into dummies. The others have too many unique values.value_counts
we find that the WnvPresent
column is highly unbalanced with $\approx$ 95$\%$ of zeros.round(100*train['WnvPresent'].value_counts()/train.shape[0],0)
0 95.0 1 5.0 Name: WnvPresent, dtype: float64
train.to_csv('train_new.csv')
Species
¶train = pd.read_csv('train_new.csv',index_col=0)
train['Date'] = pd.to_datetime(train['Date'])
train = pd.concat([train,pd.get_dummies(train['Species'], drop_first = True)], axis = 1)
train.drop('Species', inplace=True, axis=1)
train.head(2)
train.dtypes
Date | Trap | Latitude | Longitude | WnvPresent | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007-05-29 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 2007-05-29 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
Date datetime64[ns] Trap object Latitude float64 Longitude float64 WnvPresent int64 CULEX PIPIENS uint8 CULEX PIPIENS/RESTUANS uint8 CULEX RESTUANS uint8 CULEX SALINARIUS uint8 CULEX TARSALIS uint8 CULEX TERRITANS uint8 dtype: object
DataFrame
with the dates broken into pieces¶train2 = train.copy()
train2['Year']= train2.Date.dt.year
train2['DayofYear']= train2.Date.dt.dayofyear
train2.drop('Date', inplace=True, axis=1)
train2.head()
Trap | Latitude | Longitude | WnvPresent | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | Year | DayofYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | T002 | 41.954690 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2007 | 149 |
1 | T002 | 41.954690 | -87.800991 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2007 | 149 |
2 | T007 | 41.994991 | -87.769279 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2007 | 149 |
3 | T015 | 41.974089 | -87.824812 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2007 | 149 |
4 | T015 | 41.974089 | -87.824812 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2007 | 149 |
train
and train2
after EDA¶The DataFrame
with the full Date
is kept because it may be useful for merging different dataframes. Hence:
'train_after_eda.csv'
has a Date
column'train_after_eda_without_date.csv'
has no Date
column but columns Year
and DayofYear
train.to_csv('train_after_eda.csv')
train2.to_csv('train_after_eda_without_date.csv')
cols_to_keep_test = ['Date', 'Species','Trap', 'Latitude', 'Longitude']
test = pd.read_csv('test.csv')
test['Date'] = pd.to_datetime(test['Date'])
test = test[cols_to_keep_test]
test.head()
print("Number of duplicates:",test.duplicated().sum())
Date | Species | Trap | Latitude | Longitude | |
---|---|---|---|---|---|
0 | 2008-06-11 | CULEX PIPIENS/RESTUANS | T002 | 41.95469 | -87.800991 |
1 | 2008-06-11 | CULEX RESTUANS | T002 | 41.95469 | -87.800991 |
2 | 2008-06-11 | CULEX PIPIENS | T002 | 41.95469 | -87.800991 |
3 | 2008-06-11 | CULEX SALINARIUS | T002 | 41.95469 | -87.800991 |
4 | 2008-06-11 | CULEX TERRITANS | T002 | 41.95469 | -87.800991 |
Number of duplicates: 1533
test = pd.concat([test,pd.get_dummies(test['Species'], drop_first = True)], axis = 1)
test.drop('Species', inplace=True, axis=1)
test.head()
Date | Trap | Latitude | Longitude | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | UNSPECIFIED CULEX | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
DataFrame
with the dates broken into pieces¶test2 = test.copy()
test2['Year']= test2.Date.dt.year
test2['DayofYear']= test2.Date.dt.dayofyear
test2.drop('Date', inplace=True, axis=1)
test2.head()
Trap | Latitude | Longitude | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | UNSPECIFIED CULEX | Year | DayofYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | T002 | 41.95469 | -87.800991 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2008 | 163 |
1 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2008 | 163 |
2 | T002 | 41.95469 | -87.800991 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2008 | 163 |
3 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2008 | 163 |
4 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2008 | 163 |
test
and test2
after EDA¶test.to_csv('test_after_eda.csv')
test2.to_csv('test_after_eda_without_date.csv')
spray
data and perform similar steps:¶spray = pd.read_csv('spray.csv')
spray['Date'] = pd.to_datetime(spray['Date'])
spray.head()
spray.dtypes
Date | Time | Latitude | Longitude | |
---|---|---|---|---|
0 | 2011-08-29 | 6:56:58 PM | 42.391623 | -88.089163 |
1 | 2011-08-29 | 6:57:08 PM | 42.391348 | -88.089163 |
2 | 2011-08-29 | 6:57:18 PM | 42.391022 | -88.089157 |
3 | 2011-08-29 | 6:57:28 PM | 42.390637 | -88.089158 |
4 | 2011-08-29 | 6:57:38 PM | 42.390410 | -88.088858 |
Date datetime64[ns] Time object Latitude float64 Longitude float64 dtype: object
spray[spray.duplicated(keep=False)].head()
Date | Time | Latitude | Longitude | |
---|---|---|---|---|
484 | 2011-09-07 | 7:43:40 PM | 41.983917 | -87.793088 |
485 | 2011-09-07 | 7:43:40 PM | 41.983917 | -87.793088 |
489 | 2011-09-07 | 7:44:32 PM | 41.986460 | -87.794225 |
490 | 2011-09-07 | 7:44:32 PM | 41.986460 | -87.794225 |
491 | 2011-09-07 | 7:44:32 PM | 41.986460 | -87.794225 |
eda(spray)
1) Are there missing values: Yes|Percentage of missing values in each column: Date 0.000000 Time 0.039366 Latitude 0.000000 Longitude 0.000000 dtype: float64 2) Which are the data types: Date datetime64[ns] Time object Latitude float64 Longitude float64 dtype: object 3) Dataframe shape: (14835, 4) 4) Unique values per columm Date : 10 Time : 8583 Latitude : 12887 Longitude : 13007 5) Removing duplicates Initial shape: (14835, 4) Shape after removing duplicates: (14294, 4)
spray[spray.duplicated(keep=False)]
Date | Time | Latitude | Longitude |
---|
Indeed, printing out the DataFrame
we see that there are several NaNs
but the percentage is low.
spray.isnull().sum()
spray[spray['Time'].isnull()].head()
print('% of NaNs in the `Time` column:',round(spray[spray['Time'].isnull()].shape[0]/spray.shape[0],2))
Date 0 Time 584 Latitude 0 Longitude 0 dtype: int64
Date | Time | Latitude | Longitude | |
---|---|---|---|---|
1030 | 2011-09-07 | NaN | 41.987092 | -87.794286 |
1031 | 2011-09-07 | NaN | 41.987620 | -87.794382 |
1032 | 2011-09-07 | NaN | 41.988004 | -87.794574 |
1033 | 2011-09-07 | NaN | 41.988292 | -87.795486 |
1034 | 2011-09-07 | NaN | 41.988100 | -87.796014 |
% of NaNs in the `Time` column: 0.04
NaNs
or remove the column altogether. The second option seems to make more sense since time does look like a relevant variable¶#spray.dropna(inplace=True)
spray.drop('Time', inplace=True, axis=1)
spray.head()
Date | Latitude | Longitude | |
---|---|---|---|
0 | 2011-08-29 | 42.391623 | -88.089163 |
1 | 2011-08-29 | 42.391348 | -88.089163 |
2 | 2011-08-29 | 42.391022 | -88.089157 |
3 | 2011-08-29 | 42.390637 | -88.089158 |
4 | 2011-08-29 | 42.390410 | -88.088858 |
spray.isnull().sum() # Sanity check
Date 0 Latitude 0 Longitude 0 dtype: int64
spray2 = spray.copy()
spray2['Year']= spray2.Date.dt.year
spray2['DayofYear']= spray2.Date.dt.dayofyear
spray2.drop('Date', inplace=True, axis=1)
spray2.head()
Latitude | Longitude | Year | DayofYear | |
---|---|---|---|---|
0 | 42.391623 | -88.089163 | 2011 | 241 |
1 | 42.391348 | -88.089163 | 2011 | 241 |
2 | 42.391022 | -88.089157 | 2011 | 241 |
3 | 42.390637 | -88.089158 | 2011 | 241 |
4 | 42.390410 | -88.088858 | 2011 | 241 |
spray.to_csv('spray_after_eda.csv')
spray2.to_csv('spray_after_eda_without_date.csv')
weather
DataFrame
:¶weather = pd.read_csv('weather.csv')
DataFrame
of duplicates rows and its shape¶weather[weather.duplicated(keep=False)].head(2)
weather[weather.duplicated(keep=False)].shape[0]
Station | Date | Tmax | Tmin | Tavg | Depart | DewPoint | WetBulb | Heat | Cool | Sunrise | Sunset | CodeSum | Depth | Water1 | SnowFall | PrecipTotal | StnPressure | SeaLevel | ResultSpeed | ResultDir | AvgSpeed |
---|
0
eda(weather)
1) Are there missing values: No 2) Which are the data types: Station int64 Date object Tmax int64 Tmin int64 Tavg object Depart object DewPoint int64 WetBulb object Heat object Cool object Sunrise object Sunset object CodeSum object Depth object Water1 object SnowFall object PrecipTotal object StnPressure object SeaLevel object ResultSpeed float64 ResultDir int64 AvgSpeed object dtype: object 3) Dataframe shape: (2944, 22) 4) Unique values per columm Station : 2 Date : 1472 Tmax : 63 Tmin : 54 Tavg : 60 Depart : 42 DewPoint : 54 WetBulb : 48 Heat : 31 Cool : 31 Sunrise : 122 Sunset : 119 CodeSum : 98 Depth : 2 Water1 : 1 SnowFall : 4 PrecipTotal : 168 StnPressure : 104 SeaLevel : 102 ResultSpeed : 190 ResultDir : 36 AvgSpeed : 178 5) Removing duplicates Initial shape: (2944, 22) Shape after removing duplicates: (2944, 22)
The Water1
column has just 1 value namely M
and the latter means missing. We remove this column.
weather['Water1'].value_counts()
weather['Water1'].nunique()
weather['Water1'].unique()
weather.drop('Water1', inplace=True, axis=1)
M 2944 Name: Water1, dtype: int64
1
array(['M'], dtype=object)
The Depth
column has just two values namely 0 and M
and the latter means missing. We remove this column.
weather['Depth'].value_counts()
weather['Depth'].nunique()
weather['Depth'].unique()
weather.drop('Depth', inplace=True, axis=1)
M 1472 0 1472 Name: Depth, dtype: int64
2
array(['0', 'M'], dtype=object)
datetime
:¶weather['Date'] = pd.to_datetime(weather['Date'])
weather.to_csv('weather_new.csv')
Station
, namely, 1 and 2.Hot and dry conditions are more favorable for West Nile virus than cold and wet.
We provide you with the dataset from NOAA of the weather conditions of 2007 to 2014, during the months of the tests.
Each date had 2 records, 1 for each Station=1
and other for Station=2
. However as we shall see most missing values are in the latter which we will drop.
weather['Station'].value_counts()
weather['Station'].unique()
1 1472 2 1472 Name: Station, dtype: int64
array([1, 2])
for
below searches each column for data that cannot be converted to numbers:¶cols_to_keep = ['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', \
'Cool', 'Sunrise', 'Sunset', 'SnowFall', \
'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed']
print('Columns with non-convertibles:\n')
for station in [1,2]:
print('Station',station,'\n')
weather_station = weather[weather['Station']==station]
for col in weather_station[cols_to_keep]:
for x in sorted(weather_station[col].unique()):
try:
x = float(x)
except:
print(col,'| Non-convertibles, their frequency and their station:',\
(x,weather_station[weather_station[col] == x][col].count()))
print("")
Columns with non-convertibles: Station 1 WetBulb | Non-convertibles, their frequency and their station: ('M', 3) SnowFall | Non-convertibles, their frequency and their station: (' T', 12) PrecipTotal | Non-convertibles, their frequency and their station: (' T', 163) StnPressure | Non-convertibles, their frequency and their station: ('M', 2) SeaLevel | Non-convertibles, their frequency and their station: ('M', 5) Station 2 Tavg | Non-convertibles, their frequency and their station: ('M', 11) Depart | Non-convertibles, their frequency and their station: ('M', 1472) WetBulb | Non-convertibles, their frequency and their station: ('M', 1) Heat | Non-convertibles, their frequency and their station: ('M', 11) Cool | Non-convertibles, their frequency and their station: ('M', 11) Sunrise | Non-convertibles, their frequency and their station: ('-', 1472) Sunset | Non-convertibles, their frequency and their station: ('-', 1472) SnowFall | Non-convertibles, their frequency and their station: ('M', 1472) PrecipTotal | Non-convertibles, their frequency and their station: (' T', 155) PrecipTotal | Non-convertibles, their frequency and their station: ('M', 2) StnPressure | Non-convertibles, their frequency and their station: ('M', 2) SeaLevel | Non-convertibles, their frequency and their station: ('M', 4) AvgSpeed | Non-convertibles, their frequency and their station: ('M', 3)
weather.to_csv('weather_new_2.csv')
Station=2
¶weather = weather[weather['Station'] == 1]
weather.dtypes
weather['Station'].unique()
del weather['Station']
Station int64 Date datetime64[ns] Tmax int64 Tmin int64 Tavg object Depart object DewPoint int64 WetBulb object Heat object Cool object Sunrise object Sunset object CodeSum object SnowFall object PrecipTotal object StnPressure object SeaLevel object ResultSpeed float64 ResultDir int64 AvgSpeed object dtype: object
array([1])
print('Columns with non-convertibles:\n')
for col in weather[cols_to_keep]:
for x in sorted(weather[col].unique()):
try:
x = float(x)
except:
print(col,'| Non-convertibles, their frequency and their station:',
(x,weather[weather[col] == x][col].count()))
Columns with non-convertibles: WetBulb | Non-convertibles, their frequency and their station: ('M', 3) SnowFall | Non-convertibles, their frequency and their station: (' T', 12) PrecipTotal | Non-convertibles, their frequency and their station: (' T', 163) StnPressure | Non-convertibles, their frequency and their station: ('M', 2) SeaLevel | Non-convertibles, their frequency and their station: ('M', 5)
cols_with_M = ['WetBulb', 'StnPressure', 'SeaLevel']
for col in cols_with_M:
weather[col] = weather[col].str.strip()
weather[col] = weather[col].str.replace('M','0.0').astype(float)
cols_with_T = ['SnowFall', 'PrecipTotal']
for col in cols_with_T:
weather[col] = weather[col].str.replace(' T','0.05').astype(float)
for col in cols_to_keep:
weather[col] = weather[col].astype(float)
weather.to_csv('weather_new_4.csv')
In particular in the columns:
cols_zeros = ['Heat','Cool','SnowFall']
there is a substantial quantity of zeros. We will drop these.
weather = pd.read_csv('weather_new_4.csv',index_col=0)
weather['Date'] = pd.to_datetime(weather['Date'])
cols_zeros = ['Heat','Cool','SnowFall']
for col in cols_zeros:
print('{}'.format(col),weather[weather[col] == 0.0][col].value_counts()/weather.shape[0]);
Heat 0.0 0.61413 Name: Heat, dtype: float64 Cool 0.0 0.415082 Name: Cool, dtype: float64 SnowFall 0.0 0.991168 Name: SnowFall, dtype: float64
for col in cols_zeros:
weather.drop(col, inplace=True, axis=1)
weather.head()
Date | Tmax | Tmin | Tavg | Depart | DewPoint | WetBulb | Sunrise | Sunset | CodeSum | PrecipTotal | StnPressure | SeaLevel | ResultSpeed | ResultDir | AvgSpeed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007-05-01 | 83.0 | 50.0 | 67.0 | 14.0 | 51.0 | 56.0 | 448.0 | 1849.0 | 0.00 | 29.10 | 29.82 | 1.7 | 27.0 | 9.2 | |
2 | 2007-05-02 | 59.0 | 42.0 | 51.0 | -3.0 | 42.0 | 47.0 | 447.0 | 1850.0 | BR | 0.00 | 29.38 | 30.09 | 13.0 | 4.0 | 13.4 |
4 | 2007-05-03 | 66.0 | 46.0 | 56.0 | 2.0 | 40.0 | 48.0 | 446.0 | 1851.0 | 0.00 | 29.39 | 30.12 | 11.7 | 7.0 | 11.9 | |
6 | 2007-05-04 | 66.0 | 49.0 | 58.0 | 4.0 | 41.0 | 50.0 | 444.0 | 1852.0 | RA | 0.05 | 29.31 | 30.05 | 10.4 | 8.0 | 10.8 |
8 | 2007-05-05 | 66.0 | 53.0 | 60.0 | 5.0 | 38.0 | 49.0 | 443.0 | 1853.0 | 0.05 | 29.40 | 30.10 | 11.7 | 7.0 | 12.0 |
weather.to_csv('weather_new_5.csv')
weather = pd.read_csv('weather_new_5.csv',index_col=0)
weather['Date'] = pd.to_datetime(weather['Date'])
CodeSum
¶If CodeSum
entries are letters, they indicate some significant weather event. We can dummify it.
Let us use regex. We use '^\w'
to match a string consisting of a single character where that character is alphanumeric (the '\w' means "any word character"), an underscore or an asterisk.
weather['CodeSum'].str.strip() # strips empty spaces
weather['CodeSum'][weather['CodeSum'].str.contains('^\w')] = '1'
weather['CodeSum'][weather['CodeSum'] !='1'] = '0'
0 2 BR 4 6 RA 8 10 12 RA 14 BR 16 BR HZ 18 BR 20 22 24 26 28 RA BR 30 RA BR 32 34 36 38 RA VCTS 40 42 44 46 RA HZ 48 50 TSRA RA BR HZ 52 RA BR 54 RA 56 BR HZ 58 HZ 60 RA HZ 62 TSRA BR HZ 64 RA BR 66 TSRA RA BR HZ 68 TSRA RA BR 70 RA BR 72 RA 74 76 78 80 82 84 86 88 90 92 RA HZ 94 HZ 96 TSRA RA HZ 98 RA 100 102 RA 104 RA 106 RA BR 108 BR HZ 110 BR HZ 112 TSRA BR HZ VCTS 114 TSRA 116 118 120 122 124 126 TSRA RA BR 128 TSRA BR HZ FU 130 BR HZ FU 132 134 136 138 TSRA RA VCTS 140 HZ VCTS 142 144 146 148 150 152 RA BR 154 RA BR VCTS 156 TSRA RA BR VCTS 158 DZ BR 160 162 164 166 168 170 172 TS TSRA RA BR HZ 174 RA BR HZ 176 BR HZ 178 180 182 184 HZ 186 HZ 188 190 RA 192 TSRA RA BR HZ 194 TSRA RA BR HZ 196 RA BR 198 TSRA RA BR 200 TSRA RA 202 BR 204 BR HZ 206 TSRA 208 BR 210 TS TSRA RA BR 212 TSRA RA BR 214 BR 216 218 RA 220 TS RA BR 222 TSRA RA BR 224 BR HZ 226 TS TSRA RA 228 TS TSRA RA BR VCTS 230 TSRA RA 232 RA 234 236 238 240 242 244 246 248 250 252 254 RA 256 RA HZ 258 TSRA RA BR HZ 260 262 264 RA BR 266 268 270 272 274 276 278 280 282 284 286 HZ 288 290 292 294 TS RA 296 RA 298 BR 300 302 304 TSRA BR 306 RA DZ FG+ BCFG BR 308 RA FG+ MIFG BR 310 312 HZ 314 HZ 316 318 320 322 324 RA DZ 326 328 330 332 RA 334 RA BR 336 RA DZ BR 338 RA HZ 340 RA BR 342 RA BR 344 346 348 RA DZ BR 350 352 354 RA 356 RA BR 358 RA BR 360 362 364 366 368 370 TSRA RA BR 372 RA 374 376 378 380 RA BR 382 384 386 388 RA BR 390 392 TSRA RA 394 DZ BR 396 398 400 RA 402 404 RA 406 408 410 412 414 416 TSRA 418 TSRA RA BR 420 422 424 RA 426 TSRA RA BR 428 430 432 RA 434 TSRA RA FG+ FG BR 436 TSRA DZ FG+ FG BR HZ 438 TSRA BR 440 RA 442 TS RA BR 444 TS TSRA RA 446 RA BR 448 RA 450 452 454 TSRA RA BR 456 458 RA 460 462 464 466 468 TSRA RA 470 472 TS TSRA 474 476 478 RA 480 BR 482 RA 484 TSRA HZ 486 RA BR 488 490 492 TS RA BR 494 496 498 500 502 TSRA RA 504 506 508 TSRA RA BR 510 TSRA RA BR 512 TS TSRA RA BR 514 516 518 520 522 524 RA 526 RA BR VCTS 528 TSRA 530 TSRA RA 532 534 536 538 540 BR 542 544 546 TSRA RA 548 550 RA BR 552 RA 554 556 558 TS TSRA RA BR 560 HZ 562 564 566 568 570 572 574 576 RA 578 BR HZ 580 582 584 586 588 590 592 RA 594 TSRA BR HZ 596 BR 598 600 602 604 606 RA BR 608 RA BR 610 612 614 616 TS 618 TSRA 620 RA BR 622 624 BR 626 628 RA BR 630 632 634 RA BR 636 RA DZ BR 638 RA BR 640 RA BR 642 644 646 648 650 652 654 FG+ BR HZ 656 BR HZ 658 660 TS TSRA BR HZ 662 BR HZ 664 BR 666 BR 668 670 RA BR 672 674 676 678 RA 680 682 RA 684 686 RA BR 688 RA BR 690 692 694 696 BR 698 700 RA 702 RA BR 704 706 708 RA BR 710 712 RA BR 714 716 718 RA 720 TSRA RA BR 722 RA DZ BR 724 RA 726 728 730 732 734 736 RA 738 740 742 744 746 RA BR 748 RA BR HZ 750 TSRA RA 752 754 756 758 760 TSRA RA BR 762 764 RA BR 766 BR 768 770 772 774 776 778 780 782 784 RA DZ 786 TSRA RA BR 788 DZ BR HZ 790 792 794 RA 796 798 TSRA RA 800 RA BR 802 804 806 808 810 BR 812 TSRA RA BR 814 DZ BR 816 RA 818 RA HZ 820 RA 822 RA BR 824 826 828 RA BR 830 BR HZ 832 RA BR 834 TS TSRA BR 836 838 840 TSRA RA 842 844 TSRA 846 848 850 TS RA 852 854 856 858 RA DZ 860 862 864 RA BR FU 866 RA BR HZ 868 870 RA 872 RA 874 876 RA BR 878 TSRA BR HZ 880 882 884 886 BR 888 890 892 894 896 898 RA DZ 900 BR HZ 902 TS BR HZ 904 TSRA RA 906 908 RA 910 RA 912 RA 914 916 918 920 RA 922 924 926 BR 928 930 932 RA BR 934 RA BR 936 938 940 942 944 946 948 950 TSRA 952 TSRA RA BR 954 956 RA 958 RA 960 RA 962 964 966 968 970 RA BR 972 RA BR 974 RA BR 976 RA DZ BR 978 980 982 984 986 988 BR 990 BR HZ 992 RA BR HZ 994 BR 996 998 BR HZ 1000 BR HZ 1002 BR 1004 FG+ BR 1006 FG+ MIFG BR 1008 1010 1012 1014 1016 BR 1018 1020 RA DZ BR 1022 DZ BR HZ 1024 BR HZ 1026 BR 1028 BR 1030 RA DZ BR 1032 BR 1034 TSRA RA FG BR 1036 RA 1038 1040 1042 RA BR 1044 RA BR 1046 RA DZ BR 1048 1050 1052 RA DZ BR 1054 1056 RA DZ BR 1058 RA BR 1060 1062 1064 RA DZ BR 1066 1068 RA DZ BR 1070 RA DZ BR 1072 RA 1074 DZ 1076 1078 1080 RA 1082 RA 1084 RA BR 1086 RA DZ BR 1088 RA DZ BR 1090 RA BR 1092 RA BR 1094 RA BR 1096 BR HZ 1098 RA BR HZ 1100 RA DZ BR 1102 1104 RA BR 1106 RA BR 1108 1110 1112 1114 TSRA 1116 TSRA RA BR 1118 RA 1120 1122 RA 1124 RA DZ BR 1126 TSRA RA FG BR 1128 TSRA RA BR 1130 1132 1134 1136 RA 1138 1140 1142 RA 1144 RA DZ BR HZ 1146 FG+ BR HZ 1148 1150 1152 1154 1156 1158 1160 1162 1164 TSRA RA BR 1166 BR HZ 1168 TSRA RA BR 1170 1172 1174 RA BR 1176 RA 1178 1180 RA BR 1182 BR 1184 1186 TS RA 1188 TSRA RA BR 1190 RA BR 1192 RA BR 1194 RA BR 1196 1198 1200 TSRA BR 1202 1204 1206 TSRA BR 1208 1210 TS TSRA RA BR 1212 1214 1216 TS TSRA 1218 1220 1222 1224 1226 1228 1230 1232 1234 1236 TSRA RA 1238 BR 1240 TSRA RA BR 1242 1244 1246 TSRA RA BR 1248 BR 1250 1252 BR 1254 1256 1258 1260 1262 RA 1264 1266 1268 1270 TS TSRA RA BR 1272 TSRA RA BR 1274 1276 1278 1280 BR 1282 1284 RA 1286 TS TSRA RA BR 1288 BR HZ 1290 RA BR 1292 TSRA RA BR 1294 TSRA RA BR 1296 1298 1300 1302 RA BR 1304 TS 1306 1308 BR 1310 1312 TS BR 1314 1316 1318 1320 RA 1322 RA 1324 1326 RA 1328 RA BR 1330 1332 1334 1336 1338 1340 1342 1344 1346 1348 1350 RA 1352 RA BR 1354 1356 1358 RA 1360 1362 1364 1366 1368 1370 RA BR 1372 1374 1376 1378 1380 RA 1382 1384 TSRA BR 1386 RA 1388 1390 TSRA RA 1392 1394 1396 RA BR 1398 1400 RA 1402 1404 1406 1408 1410 RA BR 1412 RA 1414 RA 1416 1418 1420 1422 1424 1426 1428 1430 1432 1434 TSRA RA 1436 1438 1440 BR 1442 1444 1446 1448 1450 1452 1454 RA BR 1456 RA BR 1458 BR 1460 RA BR 1462 1464 1466 1468 1470 1472 1474 1476 RA 1478 1480 RA BR 1482 RA 1484 1486 1488 RA 1490 1492 TS TSRA BR HZ 1494 TS TSRA BR 1496 RA BR 1498 RA BR 1500 RA BR 1502 1504 1506 RA DZ BR 1508 BR HZ 1510 BR 1512 BR 1514 TSRA RA BR 1516 1518 1520 TSRA RA BR 1522 RA BR 1524 RA 1526 RA BR 1528 TS RA FG+ FG BR 1530 1532 TS 1534 1536 RA 1538 1540 TSRA 1542 1544 1546 1548 TSRA 1550 TSRA RA BR 1552 RA BR HZ 1554 RA DZ FG+ FG BR 1556 BR 1558 1560 1562 TSRA RA BR 1564 RA BR 1566 1568 1570 1572 TSRA RA BR 1574 RA BR SQ 1576 RA 1578 RA BR 1580 1582 1584 1586 RA HZ 1588 1590 1592 1594 1596 BR 1598 1600 1602 1604 1606 RA 1608 1610 1612 HZ 1614 HZ 1616 RA 1618 1620 1622 1624 RA 1626 1628 RA HZ 1630 1632 BR HZ 1634 TS 1636 TS TSRA RA BR 1638 TS TSRA RA BR 1640 TS TSRA BR 1642 1644 1646 TS TSRA 1648 TSRA RA BR 1650 BR 1652 1654 1656 1658 1660 1662 1664 RA 1666 RA BR 1668 TSRA BR 1670 RA BR 1672 BR 1674 1676 1678 1680 RA 1682 RA 1684 1686 1688 1690 1692 1694 RA FG BR 1696 RA 1698 1700 TSRA BR 1702 BR 1704 1706 1708 1710 1712 1714 RA 1716 RA 1718 1720 HZ 1722 RA BR HZ 1724 1726 1728 1730 1732 RA 1734 RA 1736 BR 1738 BR 1740 1742 BR HZ FU 1744 RA BR HZ FU 1746 1748 1750 1752 RA DZ BR 1754 RA BR 1756 MIFG BCFG BR 1758 RA 1760 1762 1764 RA BR 1766 RA 1768 RA BR 1770 RA BR 1772 RA BR 1774 RA BR 1776 RA 1778 1780 1782 1784 1786 1788 1790 1792 1794 1796 1798 1800 1802 RA BR 1804 1806 1808 RA 1810 1812 RA 1814 RA BR 1816 RA 1818 1820 1822 TSRA RA BR 1824 1826 RA 1828 1830 RA 1832 1834 1836 RA BR 1838 BR 1840 RA BR HZ 1842 FG+ BR 1844 TS TSRA 1846 TSRA BR HZ 1848 RA BR HZ 1850 TS TSRA RA BR 1852 TSRA RA FG+ FG BR HZ 1854 BR 1856 RA 1858 1860 1862 RA 1864 1866 1868 TS RA 1870 1872 1874 1876 1878 TSRA RA 1880 1882 1884 1886 1888 1890 RA 1892 1894 1896 1898 1900 RA BR 1902 RA 1904 1906 1908 1910 1912 1914 1916 1918 1920 1922 RA 1924 1926 1928 1930 1932 RA BR 1934 RA BR 1936 1938 1940 1942 RA 1944 1946 1948 1950 1952 1954 1956 TSRA HZ 1958 TS TSRA RA 1960 1962 1964 BR HZ 1966 1968 1970 TSRA 1972 1974 1976 1978 1980 1982 1984 1986 TS TSRA RA BR 1988 1990 1992 1994 1996 TS TSRA RA BR 1998 TSRA RA BR 2000 2002 2004 2006 RA 2008 TSRA RA 2010 RA 2012 TSRA RA 2014 2016 2018 RA 2020 2022 2024 2026 2028 BR 2030 TS TSRA 2032 2034 2036 2038 RA 2040 RA BR 2042 TSRA RA BR 2044 2046 2048 RA BR 2050 BR 2052 2054 TSRA BR 2056 2058 2060 2062 TSRA RA 2064 2066 2068 RA 2070 2072 2074 RA BR 2076 BR 2078 2080 2082 2084 2086 RA 2088 2090 BR 2092 TSRA RA BR 2094 RA 2096 2098 RA BR 2100 RA 2102 2104 2106 2108 2110 RA 2112 2114 2116 2118 TSRA RA BR 2120 2122 2124 2126 RA 2128 RA 2130 2132 2134 2136 2138 2140 RA 2142 2144 2146 2148 2150 RA DZ BR 2152 RA 2154 2156 2158 2160 2162 RA BR 2164 2166 RA 2168 2170 RA DZ BR 2172 TSRA RA BR 2174 2176 2178 RA BR 2180 RA 2182 RA DZ 2184 2186 2188 TSRA RA BR 2190 RA BR 2192 2194 RA 2196 2198 2200 2202 2204 2206 2208 2210 RA 2212 RA BR 2214 2216 RA 2218 2220 2222 2224 RA 2226 RA DZ BR 2228 2230 2232 2234 2236 2238 2240 2242 2244 BR HZ 2246 TSRA RA BR 2248 RA BR 2250 TS RA BR 2252 RA BR 2254 2256 2258 RA 2260 TSRA DZ BR HZ 2262 TSRA RA BR 2264 2266 TSRA RA 2268 RA BR 2270 RA BR 2272 RA DZ BR 2274 2276 2278 RA 2280 RA 2282 2284 2286 RA 2288 TSRA RA BR HZ 2290 RA BR 2292 TS TSRA BR 2294 BR 2296 2298 RA BR 2300 2302 2304 BR 2306 2308 2310 RA BR 2312 RA 2314 RA 2316 BR 2318 TSRA 2320 TSRA RA BR 2322 BR 2324 RA 2326 BR 2328 2330 2332 2334 DZ BR HZ 2336 BR 2338 2340 2342 2344 TSRA RA BR 2346 BR 2348 2350 2352 2354 2356 2358 2360 2362 2364 2366 TS BR 2368 2370 2372 TSRA RA 2374 2376 2378 2380 RA BR 2382 2384 RA 2386 2388 RA 2390 RA DZ BR 2392 2394 RA BR 2396 2398 RA 2400 RA 2402 2404 2406 2408 2410 2412 RA 2414 RA DZ 2416 2418 2420 2422 2424 2426 2428 2430 2432 RA 2434 RA 2436 BR 2438 2440 2442 2444 2446 2448 BR HZ 2450 BR 2452 TS TSRA 2454 TS 2456 2458 2460 2462 2464 2466 2468 2470 2472 2474 2476 RA 2478 2480 2482 RA BR 2484 2486 2488 2490 TS TSRA RA FG BR HZ 2492 2494 2496 2498 2500 2502 2504 BR 2506 2508 RA BR 2510 RA BR 2512 2514 BR 2516 BR 2518 RA DZ FG+ BR 2520 RA BR 2522 TSRA BR 2524 RA 2526 2528 2530 2532 BR 2534 2536 RA BR 2538 2540 2542 RA BR 2544 DZ 2546 RA BR 2548 2550 RA 2552 RA 2554 2556 RA 2558 2560 2562 2564 2566 2568 2570 2572 RA BR HZ 2574 RA BR 2576 RA DZ BR 2578 2580 2582 2584 RA 2586 2588 2590 RA 2592 RA BR 2594 2596 2598 TS TSRA BR 2600 RA BR 2602 RA BR 2604 RA 2606 RA DZ 2608 BR 2610 2612 RA 2614 TS TSRA RA BR 2616 RA BR HZ 2618 2620 2622 2624 2626 2628 RA BR 2630 RA BR 2632 2634 2636 2638 2640 RA 2642 2644 RA BR 2646 2648 2650 RA 2652 RA 2654 2656 RA BR 2658 RA DZ BR 2660 DZ BR 2662 2664 2666 2668 2670 RA 2672 TS TSRA BR HZ 2674 TSRA FG+ BR HZ 2676 BR HZ 2678 RA BR 2680 FG+ BR HZ 2682 RA BR 2684 TSRA RA 2686 FG+ FG BR HZ 2688 BR HZ 2690 FG BR HZ 2692 2694 RA 2696 TS TSRA RA 2698 RA 2700 RA DZ BR HZ 2702 RA 2704 HZ 2706 RA HZ 2708 2710 TSRA 2712 TSRA 2714 2716 2718 RA 2720 RA BR 2722 2724 2726 RA 2728 2730 2732 2734 HZ 2736 2738 2740 TS 2742 2744 2746 2748 2750 RA BR HZ 2752 2754 2756 2758 2760 2762 2764 2766 TSRA RA BR 2768 BR HZ 2770 2772 2774 2776 2778 2780 TSRA RA BR 2782 RA 2784 2786 2788 2790 2792 DZ BR HZ 2794 BR HZ 2796 RA 2798 2800 TSRA RA BR 2802 TSRA RA BR 2804 TS RA BR HZ 2806 2808 TSRA BR 2810 2812 2814 RA 2816 2818 2820 BR HZ 2822 RA 2824 RA 2826 2828 RA 2830 TS RA 2832 2834 2836 2838 RA 2840 RA BR 2842 2844 RA BR 2846 2848 2850 RA 2852 2854 2856 BCFG BR 2858 BR 2860 RA 2862 RA 2864 2866 2868 2870 2872 BR 2874 2876 BR HZ 2878 RA BR 2880 DZ BR 2882 2884 TSRA RA BR 2886 RA 2888 RA DZ BR 2890 RA 2892 2894 2896 2898 2900 2902 2904 RA DZ BR 2906 RA DZ BR 2908 RA DZ BR 2910 RA DZ BR 2912 DZ BR 2914 2916 RA DZ 2918 2920 2922 2924 2926 BR 2928 2930 BR 2932 2934 2936 2938 2940 2942 RA SN Name: CodeSum, dtype: object
/Users/coreygirard/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from ipykernel import kernelapp as app /Users/coreygirard/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance()
weather['CodeSum']= weather['CodeSum'].astype(int)
weather.dtypes
Date datetime64[ns] Tmax float64 Tmin float64 Tavg float64 Depart float64 DewPoint float64 WetBulb float64 Sunrise float64 Sunset float64 CodeSum int64 PrecipTotal float64 StnPressure float64 SeaLevel float64 ResultSpeed float64 ResultDir float64 AvgSpeed float64 dtype: object
weather.drop('Sunrise', inplace=True, axis=1)
DataFrames
now¶train.isnull().any()
test.isnull().any()
spray.isnull().any()
weather.isnull().any()
Date False Trap False Latitude False Longitude False WnvPresent False CULEX PIPIENS False CULEX PIPIENS/RESTUANS False CULEX RESTUANS False CULEX SALINARIUS False CULEX TARSALIS False CULEX TERRITANS False dtype: bool
Date False Trap False Latitude False Longitude False CULEX PIPIENS False CULEX PIPIENS/RESTUANS False CULEX RESTUANS False CULEX SALINARIUS False CULEX TARSALIS False CULEX TERRITANS False UNSPECIFIED CULEX False dtype: bool
Date False Latitude False Longitude False dtype: bool
Date False Tmax False Tmin False Tavg False Depart False DewPoint False WetBulb False Sunset False CodeSum False PrecipTotal False StnPressure False SeaLevel False ResultSpeed False ResultDir False AvgSpeed False dtype: bool
train.head(2)
train2.head(2)
test.head(2)
test2.head(2)
spray.head(2)
weather.head(2)
Date | Trap | Latitude | Longitude | WnvPresent | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007-05-29 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 2007-05-29 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
Trap | Latitude | Longitude | WnvPresent | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | Year | DayofYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2007 | 149 |
1 | T002 | 41.95469 | -87.800991 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2007 | 149 |
Date | Trap | Latitude | Longitude | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | UNSPECIFIED CULEX | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 2008-06-11 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
Trap | Latitude | Longitude | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | UNSPECIFIED CULEX | Year | DayofYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | T002 | 41.95469 | -87.800991 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2008 | 163 |
1 | T002 | 41.95469 | -87.800991 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2008 | 163 |
Date | Latitude | Longitude | |
---|---|---|---|
0 | 2011-08-29 | 42.391623 | -88.089163 |
1 | 2011-08-29 | 42.391348 | -88.089163 |
Date | Tmax | Tmin | Tavg | Depart | DewPoint | WetBulb | Sunset | CodeSum | PrecipTotal | StnPressure | SeaLevel | ResultSpeed | ResultDir | AvgSpeed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007-05-01 | 83.0 | 50.0 | 67.0 | 14.0 | 51.0 | 56.0 | 1849.0 | 0 | 0.0 | 29.10 | 29.82 | 1.7 | 27.0 | 9.2 |
2 | 2007-05-02 | 59.0 | 42.0 | 51.0 | -3.0 | 42.0 | 47.0 | 1850.0 | 1 | 0.0 | 29.38 | 30.09 | 13.0 | 4.0 | 13.4 |
train.dtypes
train2.dtypes
test.dtypes
test2.dtypes
weather.dtypes
spray.dtypes
Date datetime64[ns] Trap object Latitude float64 Longitude float64 WnvPresent int64 CULEX PIPIENS uint8 CULEX PIPIENS/RESTUANS uint8 CULEX RESTUANS uint8 CULEX SALINARIUS uint8 CULEX TARSALIS uint8 CULEX TERRITANS uint8 dtype: object
Trap object Latitude float64 Longitude float64 WnvPresent int64 CULEX PIPIENS uint8 CULEX PIPIENS/RESTUANS uint8 CULEX RESTUANS uint8 CULEX SALINARIUS uint8 CULEX TARSALIS uint8 CULEX TERRITANS uint8 Year int64 DayofYear int64 dtype: object
Date datetime64[ns] Trap object Latitude float64 Longitude float64 CULEX PIPIENS uint8 CULEX PIPIENS/RESTUANS uint8 CULEX RESTUANS uint8 CULEX SALINARIUS uint8 CULEX TARSALIS uint8 CULEX TERRITANS uint8 UNSPECIFIED CULEX uint8 dtype: object
Trap object Latitude float64 Longitude float64 CULEX PIPIENS uint8 CULEX PIPIENS/RESTUANS uint8 CULEX RESTUANS uint8 CULEX SALINARIUS uint8 CULEX TARSALIS uint8 CULEX TERRITANS uint8 UNSPECIFIED CULEX uint8 Year int64 DayofYear int64 dtype: object
Date datetime64[ns] Tmax float64 Tmin float64 Tavg float64 Depart float64 DewPoint float64 WetBulb float64 Sunset float64 CodeSum int64 PrecipTotal float64 StnPressure float64 SeaLevel float64 ResultSpeed float64 ResultDir float64 AvgSpeed float64 dtype: object
Date datetime64[ns] Latitude float64 Longitude float64 dtype: object
for df in [train,test,spray,weather]:
df.corr()
Latitude | Longitude | WnvPresent | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | |
---|---|---|---|---|---|---|---|---|---|
Latitude | 1.000000 | -0.640108 | 0.030907 | -0.041452 | 0.027364 | 0.036295 | -0.022753 | -0.006511 | -0.065012 |
Longitude | -0.640108 | 1.000000 | -0.074075 | 0.018343 | -0.006192 | -0.035469 | 0.009949 | 0.010471 | 0.065747 |
WnvPresent | 0.030907 | -0.074075 | 1.000000 | 0.095826 | 0.018785 | -0.092170 | -0.023358 | -0.006252 | -0.037979 |
CULEX PIPIENS | -0.041452 | 0.018343 | 0.095826 | 1.000000 | -0.499768 | -0.342402 | -0.054199 | -0.014507 | -0.088123 |
CULEX PIPIENS/RESTUANS | 0.027364 | -0.006192 | 0.018785 | -0.499768 | 1.000000 | -0.567032 | -0.089755 | -0.024024 | -0.145936 |
CULEX RESTUANS | 0.036295 | -0.035469 | -0.092170 | -0.342402 | -0.567032 | 1.000000 | -0.061493 | -0.016459 | -0.099984 |
CULEX SALINARIUS | -0.022753 | 0.009949 | -0.023358 | -0.054199 | -0.089755 | -0.061493 | 1.000000 | -0.002605 | -0.015826 |
CULEX TARSALIS | -0.006511 | 0.010471 | -0.006252 | -0.014507 | -0.024024 | -0.016459 | -0.002605 | 1.000000 | -0.004236 |
CULEX TERRITANS | -0.065012 | 0.065747 | -0.037979 | -0.088123 | -0.145936 | -0.099984 | -0.015826 | -0.004236 | 1.000000 |
Latitude | Longitude | CULEX PIPIENS | CULEX PIPIENS/RESTUANS | CULEX RESTUANS | CULEX SALINARIUS | CULEX TARSALIS | CULEX TERRITANS | UNSPECIFIED CULEX | |
---|---|---|---|---|---|---|---|---|---|
Latitude | 1.000000 | -0.648909 | 0.000740 | 0.013808 | 0.003677 | -0.003519 | -0.003766 | -0.003731 | -0.003828 |
Longitude | -0.648909 | 1.000000 | -0.002649 | -0.022707 | -0.007081 | 0.006192 | 0.006704 | 0.006626 | 0.006832 |
CULEX PIPIENS | 0.000740 | -0.002649 | 1.000000 | -0.147349 | -0.143517 | -0.141748 | -0.141703 | -0.141726 | -0.141692 |
CULEX PIPIENS/RESTUANS | 0.013808 | -0.022707 | -0.147349 | 1.000000 | -0.148211 | -0.146385 | -0.146338 | -0.146362 | -0.146327 |
CULEX RESTUANS | 0.003677 | -0.007081 | -0.143517 | -0.148211 | 1.000000 | -0.142578 | -0.142533 | -0.142555 | -0.142521 |
CULEX SALINARIUS | -0.003519 | 0.006192 | -0.141748 | -0.146385 | -0.142578 | 1.000000 | -0.140776 | -0.140799 | -0.140765 |
CULEX TARSALIS | -0.003766 | 0.006704 | -0.141703 | -0.146338 | -0.142533 | -0.140776 | 1.000000 | -0.140754 | -0.140720 |
CULEX TERRITANS | -0.003731 | 0.006626 | -0.141726 | -0.146362 | -0.142555 | -0.140799 | -0.140754 | 1.000000 | -0.140743 |
UNSPECIFIED CULEX | -0.003828 | 0.006832 | -0.141692 | -0.146327 | -0.142521 | -0.140765 | -0.140720 | -0.140743 | 1.000000 |
Latitude | Longitude | |
---|---|---|
Latitude | 1.000000 | -0.761842 |
Longitude | -0.761842 | 1.000000 |
Tmax | Tmin | Tavg | Depart | DewPoint | WetBulb | Sunset | CodeSum | PrecipTotal | StnPressure | SeaLevel | ResultSpeed | ResultDir | AvgSpeed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Tmax | 1.000000 | 0.851965 | 0.966207 | 0.683311 | 0.803170 | 0.862516 | 0.558433 | -0.037853 | 0.039866 | -0.045456 | -0.030421 | -0.187049 | 0.034667 | -0.171515 |
Tmin | 0.851965 | 1.000000 | 0.957567 | 0.578054 | 0.913338 | 0.906754 | 0.563374 | 0.120111 | 0.127879 | -0.068127 | -0.039539 | -0.112225 | 0.009511 | -0.094538 |
Tavg | 0.966207 | 0.957567 | 1.000000 | 0.658357 | 0.888062 | 0.917152 | 0.582808 | 0.037840 | 0.084432 | -0.059296 | -0.036384 | -0.158167 | 0.024257 | -0.140962 |
Depart | 0.683311 | 0.578054 | 0.658357 | 1.000000 | 0.554093 | 0.589544 | 0.027630 | 0.079295 | 0.064817 | -0.046830 | -0.032197 | 0.020585 | 0.107721 | 0.058628 |
DewPoint | 0.803170 | 0.913338 | 0.888062 | 0.554093 | 1.000000 | 0.928693 | 0.470062 | 0.273803 | 0.227942 | -0.060867 | -0.064712 | -0.191599 | 0.013438 | -0.159271 |
WetBulb | 0.862516 | 0.906754 | 0.917152 | 0.589544 | 0.928693 | 1.000000 | 0.513161 | 0.160113 | 0.164530 | 0.186177 | -0.054496 | -0.165417 | 0.025540 | -0.141923 |
Sunset | 0.558433 | 0.563374 | 0.582808 | 0.027630 | 0.470062 | 0.513161 | 1.000000 | 0.015761 | 0.065072 | -0.048271 | -0.002016 | -0.100129 | -0.127182 | -0.074539 |
CodeSum | -0.037853 | 0.120111 | 0.037840 | 0.079295 | 0.273803 | 0.160113 | 0.015761 | 1.000000 | 0.343029 | -0.014655 | -0.093113 | 0.002478 | 0.045178 | 0.095038 |
PrecipTotal | 0.039866 | 0.127879 | 0.084432 | 0.064817 | 0.227942 | 0.164530 | 0.065072 | 0.343029 | 1.000000 | -0.024844 | -0.007776 | -0.027002 | 0.056042 | 0.095667 |
StnPressure | -0.045456 | -0.068127 | -0.059296 | -0.046830 | -0.060867 | 0.186177 | -0.048271 | -0.014655 | -0.024844 | 1.000000 | 0.011190 | -0.020700 | 0.013248 | -0.031916 |
SeaLevel | -0.030421 | -0.039539 | -0.036384 | -0.032197 | -0.064712 | -0.054496 | -0.002016 | -0.093113 | -0.007776 | 0.011190 | 1.000000 | -0.028038 | 0.027229 | -0.007692 |
ResultSpeed | -0.187049 | -0.112225 | -0.158167 | 0.020585 | -0.191599 | -0.165417 | -0.100129 | 0.002478 | -0.027002 | -0.020700 | -0.028038 | 1.000000 | 0.094227 | 0.911711 |
ResultDir | 0.034667 | 0.009511 | 0.024257 | 0.107721 | 0.013438 | 0.025540 | -0.127182 | 0.045178 | 0.056042 | 0.013248 | 0.027229 | 0.094227 | 1.000000 | 0.154508 |
AvgSpeed | -0.171515 | -0.094538 | -0.140962 | 0.058628 | -0.159271 | -0.141923 | -0.074539 | 0.095038 | 0.095667 | -0.031916 | -0.007692 | 0.911711 | 0.154508 | 1.000000 |
weather.drop('Tmax', inplace=True, axis=1)
weather.drop('Tmin', inplace=True, axis=1)
weather.corr()
Tavg | Depart | DewPoint | WetBulb | Sunset | CodeSum | PrecipTotal | StnPressure | SeaLevel | ResultSpeed | ResultDir | AvgSpeed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Tavg | 1.000000 | 0.658357 | 0.888062 | 0.917152 | 0.582808 | 0.037840 | 0.084432 | -0.059296 | -0.036384 | -0.158167 | 0.024257 | -0.140962 |
Depart | 0.658357 | 1.000000 | 0.554093 | 0.589544 | 0.027630 | 0.079295 | 0.064817 | -0.046830 | -0.032197 | 0.020585 | 0.107721 | 0.058628 |
DewPoint | 0.888062 | 0.554093 | 1.000000 | 0.928693 | 0.470062 | 0.273803 | 0.227942 | -0.060867 | -0.064712 | -0.191599 | 0.013438 | -0.159271 |
WetBulb | 0.917152 | 0.589544 | 0.928693 | 1.000000 | 0.513161 | 0.160113 | 0.164530 | 0.186177 | -0.054496 | -0.165417 | 0.025540 | -0.141923 |
Sunset | 0.582808 | 0.027630 | 0.470062 | 0.513161 | 1.000000 | 0.015761 | 0.065072 | -0.048271 | -0.002016 | -0.100129 | -0.127182 | -0.074539 |
CodeSum | 0.037840 | 0.079295 | 0.273803 | 0.160113 | 0.015761 | 1.000000 | 0.343029 | -0.014655 | -0.093113 | 0.002478 | 0.045178 | 0.095038 |
PrecipTotal | 0.084432 | 0.064817 | 0.227942 | 0.164530 | 0.065072 | 0.343029 | 1.000000 | -0.024844 | -0.007776 | -0.027002 | 0.056042 | 0.095667 |
StnPressure | -0.059296 | -0.046830 | -0.060867 | 0.186177 | -0.048271 | -0.014655 | -0.024844 | 1.000000 | 0.011190 | -0.020700 | 0.013248 | -0.031916 |
SeaLevel | -0.036384 | -0.032197 | -0.064712 | -0.054496 | -0.002016 | -0.093113 | -0.007776 | 0.011190 | 1.000000 | -0.028038 | 0.027229 | -0.007692 |
ResultSpeed | -0.158167 | 0.020585 | -0.191599 | -0.165417 | -0.100129 | 0.002478 | -0.027002 | -0.020700 | -0.028038 | 1.000000 | 0.094227 | 0.911711 |
ResultDir | 0.024257 | 0.107721 | 0.013438 | 0.025540 | -0.127182 | 0.045178 | 0.056042 | 0.013248 | 0.027229 | 0.094227 | 1.000000 | 0.154508 |
AvgSpeed | -0.140962 | 0.058628 | -0.159271 | -0.141923 | -0.074539 | 0.095038 | 0.095667 | -0.031916 | -0.007692 | 0.911711 | 0.154508 | 1.000000 |
weather.drop('WetBulb', inplace=True, axis=1)
weather.drop('DewPoint', inplace=True, axis=1)
weather.corr()
Tavg | Depart | Sunset | CodeSum | PrecipTotal | StnPressure | SeaLevel | ResultSpeed | ResultDir | AvgSpeed | |
---|---|---|---|---|---|---|---|---|---|---|
Tavg | 1.000000 | 0.658357 | 0.582808 | 0.037840 | 0.084432 | -0.059296 | -0.036384 | -0.158167 | 0.024257 | -0.140962 |
Depart | 0.658357 | 1.000000 | 0.027630 | 0.079295 | 0.064817 | -0.046830 | -0.032197 | 0.020585 | 0.107721 | 0.058628 |
Sunset | 0.582808 | 0.027630 | 1.000000 | 0.015761 | 0.065072 | -0.048271 | -0.002016 | -0.100129 | -0.127182 | -0.074539 |
CodeSum | 0.037840 | 0.079295 | 0.015761 | 1.000000 | 0.343029 | -0.014655 | -0.093113 | 0.002478 | 0.045178 | 0.095038 |
PrecipTotal | 0.084432 | 0.064817 | 0.065072 | 0.343029 | 1.000000 | -0.024844 | -0.007776 | -0.027002 | 0.056042 | 0.095667 |
StnPressure | -0.059296 | -0.046830 | -0.048271 | -0.014655 | -0.024844 | 1.000000 | 0.011190 | -0.020700 | 0.013248 | -0.031916 |
SeaLevel | -0.036384 | -0.032197 | -0.002016 | -0.093113 | -0.007776 | 0.011190 | 1.000000 | -0.028038 | 0.027229 | -0.007692 |
ResultSpeed | -0.158167 | 0.020585 | -0.100129 | 0.002478 | -0.027002 | -0.020700 | -0.028038 | 1.000000 | 0.094227 | 0.911711 |
ResultDir | 0.024257 | 0.107721 | -0.127182 | 0.045178 | 0.056042 | 0.013248 | 0.027229 | 0.094227 | 1.000000 | 0.154508 |
AvgSpeed | -0.140962 | 0.058628 | -0.074539 | 0.095038 | 0.095667 | -0.031916 | -0.007692 | 0.911711 | 0.154508 | 1.000000 |
sns.heatmap(weather.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x1a0f060860>
sns.heatmap(test.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x1a0d6e9c18>
sns.heatmap(spray.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x1a0d567470>
sns.heatmap(weather.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x109dae6d8>