from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def plot_confusion_matrix(cm):
    """Render a 2x2 confusion matrix as a blue heatmap.

    Axes are labelled with the two survival classes; rows are true labels,
    columns are predicted labels.
    """
    labels = ['not survived', 'survived']
    fig, axis = plt.subplots()
    image = axis.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    axis.set_title('Confusion Matrix')
    fig.colorbar(image)
    ticks = np.arange(len(labels))
    axis.set_xticks(ticks)
    axis.set_xticklabels(labels, rotation=45)
    axis.set_yticks(ticks)
    axis.set_yticklabels(labels)
    axis.set_ylabel('True label')
    axis.set_xlabel('Predicted label')
    fig.tight_layout()
# Load the Kaggle Titanic data sets. PassengerId carries no predictive
# signal, so it is dropped from the training frame (the test frame keeps
# it for the submission file).
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')
df_train = df_train.drop('PassengerId', axis=1)
df_train.head(2)
Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
def _extract_title(name):
if name.find('Mr.') > 0:
return 'Mr'
elif name.find('Mrs.') > 0:
return 'Mrs'
elif name.find('Master.') > 0:
return 'Master'
elif name.find('Miss.') > 0:
return 'Miss'
else:
return None
def extract_title(df):
    """Derive a Title column from Name and append one-hot title_* columns."""
    df['Title'] = df['Name'].map(_extract_title)
    dummies = pd.get_dummies(df['Title']).add_prefix('title_')
    return df.join(dummies)
import math
def _fill_fare(row):
fare = row[0]
sibsp = row[1]
parch = row[2]
pclass = row[3]
family = min(max(1, sibsp + parch), 4)
if fare is None or fare == 0 or math.isnan(fare):
if pclass == 1:
fare = 86
elif pclass == 2:
fare = 21
else:
fare = 10
else:
pass
#fare = fare/float(family)
return np.log(fare)
def fill_fare(df):
    """Add a FareFill column: log-fare with missing values imputed."""
    fare_cols = ['Fare', 'SibSp', 'Parch', 'Pclass']
    df['FareFill'] = df[fare_cols].apply(_fill_fare, axis=1)
    return df
def fill_age(df):
    """Add an AgeFill column: Age with missing values imputed by group.

    Imputation uses typical ages per (Sex/Title, Pclass) group. Rules run
    in order against the ORIGINAL Age-null mask, so the Title-based rules
    deliberately override the male/Pclass defaults (e.g. a male Master
    ends up 3.5, not the adult male value). A final catch-all fills any
    remaining female ages with 30.

    Fix: the original used chained assignment (``df.AgeFill[mask] = v``),
    which triggers SettingWithCopy and is a silent no-op under modern
    pandas copy-on-write; ``.loc`` assignment is the reliable form.
    """
    df['AgeFill'] = df.Age
    missing = df.Age.isnull()
    rules = [
        ((df.Sex == 'male') & (df.Pclass == 1), 40),
        ((df.Sex == 'male') & (df.Pclass == 2), 31),
        ((df.Sex == 'male') & (df.Pclass == 3), 26),
        (df.Title == 'Master', 3.5),
        ((df.Title == 'Mrs') & (df.Pclass == 1), 41.5),
        ((df.Title == 'Mrs') & (df.Pclass == 2), 32),
        ((df.Title == 'Mrs') & (df.Pclass == 3), 31),
        ((df.Title == 'Miss') & (df.Pclass == 1), 30),
        ((df.Title == 'Miss') & (df.Pclass == 2), 24),
        ((df.Title == 'Miss') & (df.Pclass == 3), 18),
    ]
    for mask, age in rules:
        df.loc[missing & mask, 'AgeFill'] = age
    # Catch-all for females still unfilled after the rules above.
    df.loc[df.AgeFill.isnull() & (df.Sex == 'female'), 'AgeFill'] = 30
    return df
def extract_pclass(df):
    """Append one-hot pclass_1/pclass_2/pclass_3 indicator columns."""
    dummies = pd.get_dummies(df.Pclass).add_prefix('pclass_')
    return df.join(dummies)
def extract_parch(df):
    """Append one-hot parch_* indicator columns, capping Parch at 4."""
    capped = df.Parch.apply(lambda p: 4 if p > 4 else p)
    dummies = pd.get_dummies(capped).add_prefix('parch_')
    return df.join(dummies)
def extract_sibsp(df):
    """Append one-hot sibsp_* indicator columns, capping SibSp at 4."""
    capped = df.SibSp.clip(upper=4)
    dummies = pd.get_dummies(capped).add_prefix('sibsp_')
    return df.join(dummies)
def convert_sex(df):
    """Add binary male/female indicator columns derived from Sex.

    Fix: the original encoding was inverted — it set male=0/female=1 for
    males (visible in the notebook's own head() output, where Mr. Braund
    has male=0). The two columns were complementary, so a symmetric
    classifier still worked, but the labels were semantically wrong; this
    version makes 'male'=1 mean male and 'female'=1 mean female.
    """
    df['male'] = df.Sex.apply(lambda s: 1 if s == 'male' else 0)
    df['female'] = df.Sex.apply(lambda s: 0 if s == 'male' else 1)
    return df
def extract_feature(df):
    """Run the full feature-engineering pipeline and drop raw columns.

    Returns a model-ready frame containing only the engineered features
    (title/pclass/sibsp/parch dummies, AgeFill, FareFill, male/female).
    """
    pipeline = (extract_title, fill_age, extract_pclass, extract_sibsp,
                extract_parch, convert_sex, fill_fare)
    for transform in pipeline:
        df = transform(df)
    raw_cols = ['PassengerId', 'Title', 'Name', 'SibSp', 'Ticket', 'Fare',
                'Pclass', 'Survived', 'Parch', 'Sex', 'Age', 'Cabin',
                'Embarked', 'CCabin']
    # Only drop the raw columns actually present (test data has no Survived).
    drop_cols = set(df.columns).intersection(raw_cols)
    return df.drop(drop_cols, axis=1)
def cross_val(X, y, K, random_state=0, clf=None):
    """K-fold cross-validate a classifier and print the mean score.

    Falls back to get_classifier() when clf is not given; returns the
    per-fold score array.
    """
    classifier = clf if clf is not None else get_classifier()
    folds = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(classifier, X, y, cv=folds)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
    return scores
# Build the training design matrix and target vector.
X_train = extract_feature(df_train)
y_train = df_train['Survived']
def grid_search_logi(df):
    """Grid-search a LogisticRegression on df and report CV/validation scores.

    Fix: despite its name, the original grid-searched an SVC while passing
    'penalty' and 'gamma' parameter grids — both invalid for SVC, so
    GridSearchCV.fit raised ValueError. This version searches
    LogisticRegression with a valid penalty/C grid ('gamma' is an SVC
    kernel parameter and has no LogisticRegression equivalent).
    """
    X = extract_feature(df)
    y = df['Survived']
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, train_size=0.83, random_state=19)
    test_parameters = [
        {'penalty': ['l1'], 'C': [1, 10, 100, 1000]},
        {'penalty': ['l2'], 'C': [1, 10, 100, 1000]},
    ]
    # Down-weight survivors to counter class imbalance.
    survived_weight = .75
    y_weights = np.array([survived_weight if s == 1 else 1 for s in y_tr])
    clf = GridSearchCV(
        LogisticRegression(C=1),
        test_parameters,
        cv=20,
        scoring='accuracy',
        n_jobs=-1,
        fit_params={'sample_weight': y_weights}
    )
    clf.fit(X_tr, y_tr)
    print(clf.best_estimator_)
    print("\n+ トレーニングデータでCVした時の平均スコア:\n")
    for params, mean_score, all_scores in clf.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))
    print("\n+ テストデータでの識別結果:\n")
    y_true, y_pred = y_val, clf.predict(X_val)
    print(classification_report(y_true, y_pred))
def grid_search_svc(df):
    """Grid-search an SVC (rbf vs. linear kernel) and report CV and
    held-out validation scores."""
    features = extract_feature(df)
    target = df['Survived']
    X_tr, X_val, y_tr, y_val = train_test_split(features, target, train_size=0.83, random_state=19)
    param_grid = [
        {'kernel': ['rbf'], 'C': [1000], 'gamma': [1e-3, 1e-4]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    ]
    # Down-weight survivors (0.75) to counter class imbalance.
    weights = np.array([.75 if label == 1 else 1 for label in y_tr])
    searcher = GridSearchCV(
        SVC(C=1),
        param_grid,
        cv=20,
        scoring='accuracy',
        n_jobs=-1,
        fit_params={'sample_weight': weights}
    )
    searcher.fit(X_tr, y_tr)
    print(searcher.best_estimator_)
    print("\n+ トレーニングデータでCVした時の平均スコア:\n")
    for params, mean_score, all_scores in searcher.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))
    print("\n+ テストデータでの識別結果:\n")
    y_true, y_pred = y_val, searcher.predict(X_val)
    print(classification_report(y_true, y_pred))
def grid_search_random_forest(df):
    """Grid-search a RandomForestClassifier, report CV/validation scores,
    and return the fitted GridSearchCV object."""
    features = extract_feature(df)
    target = df['Survived']
    X_tr, X_val, y_tr, y_val = train_test_split(features, target, train_size=0.83, random_state=19)
    # Down-weight survivors (0.75) to counter class imbalance.
    weights = np.array([.75 if label == 1 else 1 for label in y_tr])
    sqrtfeat = int(np.sqrt(X_tr.shape[1]))  # kept for the optional grids below
    param_grid = {
        'n_estimators': [100, 1000],
        # 'max_depth': [5, 6, 7, 8],
        # 'min_samples_leaf': [1, 2, 3],
        # 'max_features': np.rint(np.linspace(sqrtfeat, sqrtfeat+2, 3)).astype(int),
        # 'min_samples_split': np.rint(np.linspace(X_tr.shape[0]*.01, X_tr.shape[0]*.05, 3)).astype(int)
    }
    searcher = GridSearchCV(
        RandomForestClassifier(oob_score=True, n_estimators=10000),
        param_grid,
        cv=20,
        scoring='accuracy',
        n_jobs=-1,
        fit_params={'sample_weight': weights}
    )
    searcher.fit(X_tr, y_tr)
    print(searcher.best_estimator_)
    print("\n+ トレーニングデータでCVした時の平均スコア:\n")
    for params, mean_score, all_scores in searcher.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))
    print("\n+ テストデータでの識別結果:\n")
    y_true, y_pred = y_val, searcher.predict(X_val)
    print(classification_report(y_true, y_pred))
    return searcher
# Grid-search runs (disabled — they take a long time); separators kept
# so the notebook output stays aligned with earlier runs.
#clf = grid_search_random_forest(df_train)
print("============================================")
#grid_search_svc(df_train)
print("============================================")
============================================ ============================================
# Compare candidate classifiers with 20-fold CV on the training features.
print("Logistic Regression")
cross_val(X_train, y_train, 20, clf=LogisticRegression(penalty='l2', tol=0.01))

print("Random Forest")
cross_val(X_train, y_train, 20, clf=RandomForestClassifier())
cross_val(X_train, y_train, 20,
          clf=RandomForestClassifier(bootstrap=True, compute_importances=None,
                                     criterion='gini', max_depth=8, max_features=6,
                                     max_leaf_nodes=None, min_density=None,
                                     min_samples_leaf=2, min_samples_split=7,
                                     n_estimators=100, n_jobs=1, oob_score=True,
                                     random_state=None, verbose=0))

print("SVN (grid beast)")
cross_val(X_train, y_train, 20,
          clf=SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
                  random_state=None, shrinking=True, tol=0.001, verbose=False))

print("SVN (L2)")
cross_val(X_train, y_train, 20, clf=LinearSVC(penalty='l2'))

print("SVN (L1)")
cross_val(X_train, y_train, 20, clf=LinearSVC(penalty='l1', dual=False))

print("SVN")
cross_val(X_train, y_train, 20, clf=SVC())

print("Decision Tree")
cross_val(X_train, y_train, 20,
          clf=DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                     min_samples_leaf=2))
Logistic Regression Mean Score: 0.816 (+/-0.086) Random Forest Mean Score: 0.815 (+/-0.093) Mean Score: 0.829 (+/-0.070) SVN (grid beast) Mean Score: 0.831 (+/-0.063) SVN (L2) Mean Score: 0.810 (+/-0.115) SVN (L1) Mean Score: 0.822 (+/-0.078) SVN Mean Score: 0.814 (+/-0.092) Decision Tree Mean Score: 0.819 (+/-0.094)
array([ 0.82222222, 0.77777778, 0.77777778, 0.77777778, 0.82222222, 0.82222222, 0.82222222, 0.82222222, 0.84444444, 0.86666667, 0.84444444, 0.75 , 0.88636364, 0.72727273, 0.77272727, 0.84090909, 0.77272727, 0.88636364, 0.84090909, 0.90909091])
def calc_classifier(df, clf=None):
    """Fit a classifier on a 90/10 train/validation split and report accuracy.

    Uses get_classifier() when clf is not supplied; returns the fitted
    classifier. The confusion matrix is computed (see plot_confusion_matrix)
    but not displayed here.
    """
    features = extract_feature(df)
    target = df['Survived']
    X_tr, X_val, y_tr, y_val = train_test_split(features, target, train_size=0.90, random_state=13)
    print('Num of Training Samples: {}'.format(len(X_tr)))
    print('Num of Validation Samples: {}'.format(len(X_val)))
    # Survivor down-weighting is prepared but the weighted fit is disabled.
    survived_weight = .8
    y_weights = np.array([survived_weight if s == 1 else 1 for s in y_tr])
    if clf is None:
        clf = get_classifier()
    #clf.fit(X_tr, y_tr, sample_weight=y_weights)
    clf.fit(X_tr, y_tr)
    pred_tr = clf.predict(X_tr)
    pred_val = clf.predict(X_val)
    print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_tr, pred_tr)))
    print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, pred_val)))
    cm = confusion_matrix(y_val, pred_val)
    return clf
# Candidate estimators from the earlier grid searches, kept for reference;
# only the LogisticRegression run on the last line is actually used.
_clf = RandomForestClassifier(bootstrap=True, compute_importances=None,
                              criterion='gini', max_depth=8, max_features=10,
                              max_leaf_nodes=None, min_density=None,
                              min_samples_leaf=1, min_samples_split=7,
                              n_estimators=1000, n_jobs=-1, oob_score=True,
                              random_state=None, verbose=0)
_clf = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0, degree=3,
           gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
           random_state=None, shrinking=True, tol=0.001, verbose=False)
#_clf = LinearSVC()
clf = calc_classifier(df_train, clf=LogisticRegression())
Num of Training Samples: 801 Num of Validation Samples: 90 Accuracy on Training Set: 0.820 Accuracy on Validation Set: 0.844
# Peek at the engineered training features (notebook display).
X_train.head()
title_Master | title_Miss | title_Mr | title_Mrs | AgeFill | pclass_1 | pclass_2 | pclass_3 | male | female | FareFill | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 22 | 0 | 0 | 1 | 0 | 1 | 1.981001 |
1 | 0 | 0 | 0 | 1 | 38 | 1 | 0 | 0 | 1 | 0 | 4.266662 |
2 | 0 | 1 | 0 | 0 | 26 | 0 | 0 | 1 | 1 | 0 | 2.070022 |
3 | 0 | 0 | 0 | 1 | 35 | 1 | 0 | 0 | 1 | 0 | 3.972177 |
4 | 0 | 0 | 1 | 0 | 35 | 0 | 0 | 1 | 0 | 1 | 2.085672 |
# Engineer test-set features, score them with the trained classifier,
# and assemble the (PassengerId, Survived) submission frame.
Y = extract_feature(df_test)
predictions = clf.predict(Y)
df_test['Survived'] = predictions
submit_data = df_test[['PassengerId', 'Survived']]
Y.head()
title_Master | title_Miss | title_Mr | title_Mrs | AgeFill | pclass_1 | pclass_2 | pclass_3 | sibsp_0 | sibsp_1 | ... | sibsp_3 | sibsp_4 | parch_0 | parch_1 | parch_2 | parch_3 | parch_4 | male | female | FareFill | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 34.5 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2.057860 |
1 | 0 | 0 | 0 | 1 | 47.0 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1.945910 |
2 | 0 | 0 | 1 | 0 | 62.0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2.270836 |
3 | 0 | 0 | 1 | 0 | 27.0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2.159003 |
4 | 0 | 0 | 0 | 1 | 22.0 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 2.508582 |
5 rows × 21 columns
# Write the Kaggle submission file (PassengerId, Survived columns only).
submit_data.to_csv('./submit_20150312_grid_8.csv', index=False)
# IPython shell magic: open the working directory (not valid plain Python).
!open .