import pandas as pd
import pandas_ml as pdml
# Load the training and test CSVs, using PassengerId as the row index.
train = pd.read_csv("train.csv", index_col="PassengerId")
test = pd.read_csv("test.csv", index_col="PassengerId")
train.head(3)
Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | |||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
# Preview the first three rows of the test set.
test.head(3)
Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||||
892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
# Stack train and test so imputation and encoding are applied to both at once.
# test has no Survived column, so concat fills Survived with NaN for the test
# rows; those NaNs are what later separates labelled rows from rows to predict.
# The former `test.Survived=3` only set a Python attribute on the frame (it
# never created a column), so it had no effect on the data and was removed.
dat = pd.concat([train, test])
dat.head()
Age | Cabin | Embarked | Fare | Name | Parch | Pclass | Sex | SibSp | Survived | Ticket | |
---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | |||||||||||
1 | 22.0 | NaN | S | 7.2500 | Braund, Mr. Owen Harris | 0 | 3 | male | 1 | 0.0 | A/5 21171 |
2 | 38.0 | C85 | C | 71.2833 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 1 | female | 1 | 1.0 | PC 17599 |
3 | 26.0 | NaN | S | 7.9250 | Heikkinen, Miss. Laina | 0 | 3 | female | 0 | 1.0 | STON/O2. 3101282 |
4 | 35.0 | C123 | S | 53.1000 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 1 | female | 1 | 1.0 | 113803 |
5 | 35.0 | NaN | S | 8.0500 | Allen, Mr. William Henry | 0 | 3 | male | 0 | 0.0 | 373450 |
Name, Ticket, Cabin は扱いが面倒なので、今回は特徴量から除外する。
本当はCabin,Ticketはなんらかの情報が得られそう。
# Columns left out of this model.
ignore_feature = ["Name", "Ticket", "Cabin"]
# DataFrame.ix is deprecated and was removed in pandas 1.0; drop() removes the
# listed columns and keeps the remaining ones in their existing order.
dat = dat.drop(ignore_feature, axis=1)
dat.head()
Age | Embarked | Fare | Parch | Pclass | Sex | SibSp | Survived | |
---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||
1 | 22.0 | S | 7.2500 | 0 | 3 | male | 1 | 0.0 |
2 | 38.0 | C | 71.2833 | 0 | 1 | female | 1 | 1.0 |
3 | 26.0 | S | 7.9250 | 0 | 3 | female | 0 | 1.0 |
4 | 35.0 | S | 53.1000 | 0 | 1 | female | 1 | 1.0 |
5 | 35.0 | S | 8.0500 | 0 | 3 | male | 0 | 0.0 |
欠損値を確認する。
# Count the missing values in each column.
dat.isnull().sum()
Age 263 Embarked 2 Fare 1 Parch 0 Pclass 0 Sex 0 SibSp 0 Survived 418 dtype: int64
欠損値を埋める。 Ageは中央値にする。
# Impute missing ages with the median age over train and test combined.
# The result is assigned back to the column: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which can act on a temporary
# copy and leave dat unchanged (and always does under pandas copy-on-write).
dat["Age"] = dat["Age"].fillna(dat["Age"].median())
dat.isnull().sum()
Age 0 Embarked 2 Fare 1 Parch 0 Pclass 0 Sex 0 SibSp 0 Survived 418 dtype: int64
# Show the rows that have no recorded port of embarkation.
dat.loc[dat["Embarked"].isnull()]
Age | Embarked | Fare | Parch | Pclass | Sex | SibSp | Survived | |
---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||
62 | 38.0 | NaN | 80.0 | 0 | 1 | female | 0 | 1.0 |
830 | 62.0 | NaN | 80.0 | 0 | 1 | female | 0 | 1.0 |
# Non-null row counts per embarkation port, to see which port is most common.
dat.groupby("Embarked").count()
Age | Fare | Parch | Pclass | Sex | SibSp | Survived | |
---|---|---|---|---|---|---|---|
Embarked | |||||||
C | 270 | 270 | 270 | 270 | 270 | 270 | 168 |
Q | 123 | 123 | 123 | 123 | 123 | 123 | 77 |
S | 914 | 913 | 914 | 914 | 914 | 914 | 644 |
# Fill the missing embarkation ports with "S".
# Assign the result back rather than using fillna(inplace=True) on the
# attribute-accessed column, which may modify a temporary copy instead of dat.
dat["Embarked"] = dat["Embarked"].fillna("S")
# Show the rows that still have no fare.
dat[dat["Fare"].isnull()]
Age | Embarked | Fare | Parch | Pclass | Sex | SibSp | Survived | |
---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||
1044 | 60.5 | S | NaN | 0 | 3 | male | 0 | NaN |
# Per-class means of the numeric columns. Sex and Embarked are still strings at
# this point, so numeric_only=True is required: pandas >= 2.0 raises a
# TypeError when mean() meets non-numeric columns.
dat.groupby("Pclass").mean(numeric_only=True)
Age | Fare | Parch | SibSp | Survived | |
---|---|---|---|---|---|
Pclass | |||||
1 | 37.812446 | 87.508992 | 0.365325 | 0.436533 | 0.629630 |
2 | 29.419675 | 21.179196 | 0.368231 | 0.393502 | 0.472826 |
3 | 25.750353 | 13.302889 | 0.400564 | 0.568406 | 0.242363 |
# Fill missing fares with the mean fare of third-class passengers.
# The value is computed first and the filled column assigned back, because
# fillna(inplace=True) on an attribute-accessed column may modify a temporary
# copy instead of dat.
third_class_fare = dat.loc[dat["Pclass"] == 3, "Fare"].mean()
dat["Fare"] = dat["Fare"].fillna(third_class_fare)
# Check the passenger whose fare was previously missing.
dat[dat.index == 1044]
Age | Embarked | Fare | Parch | Pclass | Sex | SibSp | Survived | |
---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||
1044 | 60.5 | S | 13.302889 | 0 | 3 | male | 0 | NaN |
# Re-count missing values per column after imputation.
dat.isnull().sum()
Age 0 Embarked 0 Fare 0 Parch 0 Pclass 0 Sex 0 SibSp 0 Survived 418 dtype: int64
カテゴリカル変数を整数ラベルに置き換える(ダミー変数化ではなく、値の出現順に 0, 1, 2, … を割り当てる)。
# Replace each categorical column with integer codes. Codes follow the order in
# which values first appear in the column (0 for the first distinct value, 1
# for the next, ...), so this is label encoding rather than one-hot encoding.
for feature in ["Sex", "Embarked"]:
    codes = {value: code for code, value in enumerate(dat[feature].unique())}
    dat[feature] = dat[feature].map(codes)
dat.head()
Age | Embarked | Fare | Parch | Pclass | Sex | SibSp | Survived | |
---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||
1 | 22.0 | 0 | 7.2500 | 0 | 3 | 0 | 1 | 0.0 |
2 | 38.0 | 1 | 71.2833 | 0 | 1 | 1 | 1 | 1.0 |
3 | 26.0 | 0 | 7.9250 | 0 | 3 | 1 | 0 | 1.0 |
4 | 35.0 | 0 | 53.1000 | 0 | 1 | 1 | 1 | 1.0 |
5 | 35.0 | 0 | 8.0500 | 0 | 3 | 0 | 0 | 0.0 |
# Wrap the frame in a pandas_ml ModelFrame with Survived as the target column.
mf = pdml.ModelFrame(dat, target="Survived")
やっと本番
# Fit a random forest with default settings on the rows that have a label.
clf = mf.ensemble.RandomForestClassifier()
labelled = mf[mf.Survived.notnull()]
labelled.fit(clf)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Predict the rows without a label using the default forest.
unlabelled = mf[mf.Survived.isnull()]
pred_default = unlabelled.predict(clf)
grid searchで木の数を探索
# Search over the number of trees (2, 4, ..., 512) with 5-fold cross-validation,
# fitting only on the rows that have a label.
tree_counts = [2 ** exponent for exponent in range(1, 10)]
params = {'n_estimators': tree_counts}
grid_clf = mf.grid_search.GridSearchCV(clf, params, cv=5)
mf[mf.Survived.notnull()].fit(grid_clf)
GridSearchCV(cv=5, error_score='raise', estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False), fit_params={}, iid=True, n_jobs=1, param_grid={'n_estimators': [2, 4, 8, 16, 32, 64, 128, 256, 512]}, pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
# Estimator with the best cross-validated score found by the grid search.
grid_clf.best_estimator_
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=256, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Mean cross-validated score of the best parameter setting.
grid_clf.best_score_
0.81369248035914699
# Mean and standard deviation of the cross-validation score for every setting.
# NOTE(review): grid_scores_ was removed in scikit-learn 0.20 in favour of
# cv_results_ — confirm which scikit-learn version this notebook targets.
grid_clf.grid_scores_
[mean: 0.77778, std: 0.01847, params: {'n_estimators': 2}, mean: 0.79798, std: 0.02272, params: {'n_estimators': 4}, mean: 0.80808, std: 0.03727, params: {'n_estimators': 8}, mean: 0.80920, std: 0.03631, params: {'n_estimators': 16}, mean: 0.80247, std: 0.01994, params: {'n_estimators': 32}, mean: 0.80920, std: 0.03673, params: {'n_estimators': 64}, mean: 0.81257, std: 0.03057, params: {'n_estimators': 128}, mean: 0.81369, std: 0.03155, params: {'n_estimators': 256}, mean: 0.80696, std: 0.03026, params: {'n_estimators': 512}]
# Predict the unlabelled rows with the tuned forest and store the result as an
# integer Survived column.
best_forest = grid_clf.best_estimator_
unlabelled_rows = mf[mf.Survived.isnull()]
pred_grid = pd.DataFrame(
    unlabelled_rows.predict(best_forest),
    columns=["Survived"],
    dtype=int,
)
pred_grid.head()
Survived | |
---|---|
PassengerId | |
892 | 0 |
893 | 0 |
894 | 0 |
895 | 1 |
896 | 0 |
# Write the predictions (index plus Survived column) to a CSV file.
pred_grid.to_csv("predict_rf.csv")