In [1]:

import pandas_ml as pdml

In [2]:

train=pd.read_csv("train.csv",index_col="PassengerId")
test=pd.read_csv("test.csv",index_col="PassengerId")

In [3]:

train.head(3)

Out[3]:

	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
PassengerId
1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S

In [4]:

test.head(3)

Out[4]:

	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
PassengerId
892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q

In [5]:

test.Survived=3

In [6]:

dat=pd.concat([train,test])
dat.head()

Out[6]:

	Age	Cabin	Embarked	Fare	Name	Parch	Pclass	Sex	SibSp	Survived	Ticket
PassengerId
1	22.0	NaN	S	7.2500	Braund, Mr. Owen Harris	0	3	male	1	0.0	A/5 21171
2	38.0	C85	C	71.2833	Cumings, Mrs. John Bradley (Florence Briggs Th...	0	1	female	1	1.0	PC 17599
3	26.0	NaN	S	7.9250	Heikkinen, Miss. Laina	0	3	female	0	1.0	STON/O2. 3101282
4	35.0	C123	S	53.1000	Futrelle, Mrs. Jacques Heath (Lily May Peel)	0	1	female	1	1.0	113803
5	35.0	NaN	S	8.0500	Allen, Mr. William Henry	0	3	male	0	0.0	373450

Name,Ticket,Cabinは面倒ので今回は抜いちゃう。
本当はCabin,Ticketはなんらかの情報が得られそう。

In [7]:

ignore_feature=["Name","Ticket","Cabin"]
dat=dat.ix[:,[i for i in dat.columns.tolist() if i not in ignore_feature]]

In [8]:

dat.head()

Out[8]:

	Age	Embarked	Fare	Parch	Pclass	Sex	SibSp	Survived
PassengerId
1	22.0	S	7.2500	0	3	male	1	0.0
2	38.0	C	71.2833	0	1	female	1	1.0
3	26.0	S	7.9250	0	3	female	0	1.0
4	35.0	S	53.1000	0	1	female	1	1.0
5	35.0	S	8.0500	0	3	male	0	0.0

欠損値を確認する。

In [9]:

dat.apply(pd.isnull, axis=0).sum()

Out[9]:

Age         263
Embarked      2
Fare          1
Parch         0
Pclass        0
Sex           0
SibSp         0
Survived    418
dtype: int64

欠損値を埋める。 Ageは中央値にする。

In [10]:

dat.Age.fillna(dat.Age.median(), inplace=True)

In [11]:

dat.apply(pd.isnull, axis=0).sum()

Out[11]:

Age           0
Embarked      2
Fare          1
Parch         0
Pclass        0
Sex           0
SibSp         0
Survived    418
dtype: int64

In [12]:

dat[dat.Embarked.isnull()]

Out[12]:

	Age	Embarked	Fare	Parch	Pclass	Sex	SibSp	Survived
PassengerId
62	38.0	NaN	80.0	0	1	female	0	1.0
830	62.0	NaN	80.0	0	1	female	0	1.0

In [13]:

dat.groupby("Embarked").count()

Out[13]:

	Age	Fare	Parch	Pclass	Sex	SibSp	Survived
Embarked
C	270	270	270	270	270	270	168
Q	123	123	123	123	123	123	77
S	914	913	914	914	914	914	644

In [14]:

dat.Embarked.fillna("S",inplace=True)

In [15]:

dat[dat.Fare.isnull()]

Out[15]:

	Age	Embarked	Fare	Parch	Pclass	Sex	SibSp	Survived
PassengerId
1044	60.5	S	NaN	0	3	male	0	NaN

In [16]:

dat.groupby("Pclass").mean()

Out[16]:

	Age	Fare	Parch	SibSp	Survived
Pclass
1	37.812446	87.508992	0.365325	0.436533	0.629630
2	29.419675	21.179196	0.368231	0.393502	0.472826
3	25.750353	13.302889	0.400564	0.568406	0.242363

In [17]:

dat.Fare.fillna(dat[dat.Pclass==3].Fare.mean(),inplace=True)

In [18]:

dat[dat.index==1044]

Out[18]:

	Age	Embarked	Fare	Parch	Pclass	Sex	SibSp	Survived
PassengerId
1044	60.5	S	13.302889	0	3	male	0	NaN

In [19]:

dat.apply(pd.isnull, axis=0).sum()

Out[19]:

Age           0
Embarked      0
Fare          0
Parch         0
Pclass        0
Sex           0
SibSp         0
Survived    418
dtype: int64

カテゴリカル変数をダミー変数に置き換える。

In [20]:

for feature in ["Sex","Embarked"]:
    dat[feature]=dat[feature].map({i:j for j,i in enumerate(dat[feature].unique())})
dat.head()

Out[20]:

	Age	Embarked	Fare	Parch	Pclass	Sex	SibSp	Survived
PassengerId
1	22.0	0	7.2500	0	3	0	1	0.0
2	38.0	1	71.2833	0	1	1	1	1.0
3	26.0	0	7.9250	0	3	1	0	1.0
4	35.0	0	53.1000	0	1	1	1	1.0
5	35.0	0	8.0500	0	3	0	0	0.0

In [21]:

mf=pdml.ModelFrame(dat,target="Survived")

やっと本番

In [22]:

clf=mf.ensemble.RandomForestClassifier()

In [23]:

mf[~mf.Survived.isnull()].fit(clf)

Out[23]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:

pred_default=mf[mf.Survived.isnull()].predict(clf)

grid seachで木の数を探索

In [25]:

params={'n_estimators':[2**i for i in range(1,10)]}

In [26]:

grid_clf=mf.grid_search.GridSearchCV(clf,params,cv=5)

In [27]:

mf[~mf.Survived.isnull()].fit(grid_clf)

Out[27]:

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [2, 4, 8, 16, 32, 64, 128, 256, 512]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [28]:

grid_clf.best_estimator_

Out[28]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=256, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:

grid_clf.best_score_

Out[29]:

0.81369248035914699

In [30]:

grid_clf.grid_scores_

Out[30]:

[mean: 0.77778, std: 0.01847, params: {'n_estimators': 2},
 mean: 0.79798, std: 0.02272, params: {'n_estimators': 4},
 mean: 0.80808, std: 0.03727, params: {'n_estimators': 8},
 mean: 0.80920, std: 0.03631, params: {'n_estimators': 16},
 mean: 0.80247, std: 0.01994, params: {'n_estimators': 32},
 mean: 0.80920, std: 0.03673, params: {'n_estimators': 64},
 mean: 0.81257, std: 0.03057, params: {'n_estimators': 128},
 mean: 0.81369, std: 0.03155, params: {'n_estimators': 256},
 mean: 0.80696, std: 0.03026, params: {'n_estimators': 512}]

In [36]:

pred_grid=pd.DataFrame(mf[mf.Survived.isnull()].predict(grid_clf.best_estimator_),columns=["Survived"],dtype=int)

In [37]:

pred_grid.head()

Out[37]:

	Survived
PassengerId
892	0
893	0
894	0
895	1
896	0

In [38]:

pred_grid.to_csv("predict_rf.csv")