import pandas as pd
from sklearn import ensemble
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Load the training and test sets from the local data directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# Preview the first row and the first 11 columns. `.ix` has been removed from
# pandas, so the selection is done positionally with `.iloc` (row slice `:1`
# is the positional equivalent of the old inclusive label slice `:0`).
train.iloc[:1, :11]
Id | Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2596 | 51 | 3 | 258 | 0 | 510 | 221 | 232 | 148 | 6279 |
1 rows × 11 columns
# Histogram of each of the first 11 columns of the training set. `.ix` has
# been removed from pandas; `.iloc` performs the same positional selection.
train.iloc[:, :11].hist(figsize=(16, 12), bins=50)
plt.show()
def r(x):
    """Return the angle ``x`` (in degrees) shifted by 180 degrees.

    Values for which ``x + 180`` would exceed 360 have 180 subtracted
    instead, so an input in the range 0..360 maps back into 0..360.
    Note that 180 maps to 360 (not 0).
    """
    if x + 180 > 360:
        return x - 180
    return x + 180
# Add the same two derived columns to the training and the test frame.
for frame in (train, test):
    # Aspect passed through r(), i.e. shifted by 180 degrees.
    frame['Aspect2'] = frame['Aspect'].map(r)
    # Boolean flag: True where the vertical distance to hydrology is negative.
    frame['Highwater'] = frame['Vertical_Distance_To_Hydrology'] < 0
import numpy as np
from IPython.display import Image
def plotc(c1,c2):
    """Scatter-plot ``c1`` against ``c2``, coloured by ``train.Cover_Type``.

    Parameters:
        c1: values for the x axis; its ``name`` attribute is the x label.
        c2: values for the y axis; its ``name`` attribute is the y label.

    Opens a new 16x8 figure on every call. Point colours are read from the
    module-level ``train`` frame, so ``c1`` and ``c2`` need one value per
    row of ``train``.
    """
    plt.figure(figsize=(16, 8))
    # The values array can be handed to matplotlib directly as the colours.
    cover_types = train.Cover_Type.values
    plt.scatter(c1, c2, c=cover_types, s=100)
    # A Series built from an expression (e.g. a difference of two columns)
    # has no name; fall back to an empty label in that case.
    plt.xlabel('' if c1.name is None else c1.name)
    plt.ylabel('' if c2.name is None else c2.name)
# Elevation against the vertical distance to hydrology, coloured by cover type.
plotc(train['Elevation'], train['Vertical_Distance_To_Hydrology'])
# Same y axis, but with the vertical distance subtracted from the elevation
# on the x axis.
plotc(train['Elevation'] - train['Vertical_Distance_To_Hydrology'],
      train['Vertical_Distance_To_Hydrology'])
# Engineered features, computed identically for the training and test frames.
# Column names (including their spelling) are kept exactly as they were, since
# they end up as model feature names.
for frame in (train, test):
    elevation = frame['Elevation']
    hydro_h = frame['Horizontal_Distance_To_Hydrology']
    hydro_v = frame['Vertical_Distance_To_Hydrology']
    fire = frame['Horizontal_Distance_To_Fire_Points']
    road = frame['Horizontal_Distance_To_Roadways']

    # Elevation reduced by the vertical / scaled horizontal hydrology distance.
    frame['EVDtH'] = elevation - hydro_v
    frame['EHDtH'] = elevation - hydro_h * 0.2

    # Euclidean combination of the horizontal and vertical hydrology distances.
    frame['Distanse_to_Hydrolody'] = (hydro_h ** 2 + hydro_v ** 2) ** 0.5

    # For each pair of horizontal distances: sum (_1) and absolute
    # difference (_2).
    frame['Hydro_Fire_1'] = hydro_h + fire
    frame['Hydro_Fire_2'] = (hydro_h - fire).abs()
    frame['Hydro_Road_1'] = (hydro_h + road).abs()
    frame['Hydro_Road_2'] = (hydro_h - road).abs()
    frame['Fire_Road_1'] = (fire + road).abs()
    frame['Fire_Road_2'] = (fire - road).abs()
# 3pm hillshade against noon hillshade, coloured by cover type.
plotc(train['Hillshade_3pm'], train['Hillshade_Noon'])
# Every column except the target and the row identifier is a model input.
feature_cols = [name for name in train.columns if name not in ('Cover_Type', 'Id')]
X_train, X_test = train[feature_cols], test[feature_cols]
# Target labels for fitting, and the ids needed to label the predictions.
y = train.Cover_Type
test_ids = test.Id
# Extra-trees classifier with 400 trees, fitted on the engineered features.
# - `min_density` is no longer accepted by scikit-learn and has been dropped.
# - `max_features='auto'` has been replaced by 'sqrt', which is what 'auto'
#   meant for classifiers.
# random_state is left unset, so results vary from run to run.
forest = ensemble.ExtraTreesClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    n_jobs=-1,
    random_state=None,
    verbose=0,
)
forest.fit(X_train, y)
# Write the predictions as "Id,Cover_Type" rows.
# The file is opened in text mode because str objects are written to it
# (binary mode rejects str on Python 3). Ids and predictions are paired
# positionally, so the result does not depend on the index labels of test_ids.
predictions = forest.predict(X_test)
with open('features_engineering_benchmark.csv', "w") as outfile:
    outfile.write("Id,Cover_Type\n")
    outfile.writelines(
        "%s,%s\n" % (sample_id, label)
        for sample_id, label in zip(test_ids, predictions)
    )
# Ten largest feature importances of the fitted forest, highest first.
# `DataFrame.sort` has been removed from pandas; `sort_values` on column 0
# gives the same ordering.
pd.DataFrame(forest.feature_importances_, index=X_train.columns).sort_values(by=0, ascending=False)[:10]
0 | |
---|---|
EHDtH | 0.097486 |
Elevation | 0.096896 |
EVDtH | 0.092564 |
Wilderness_Area4 | 0.046237 |
Fire_Road_1 | 0.033677 |
Hydro_Road_2 | 0.032912 |
Horizontal_Distance_To_Roadways | 0.031300 |
Hydro_Road_1 | 0.030773 |
Distanse_to_Hydrolody | 0.028715 |
Horizontal_Distance_To_Hydrology | 0.027839 |
10 rows × 1 columns