import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10  # make the default figures larger
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
Objective features: age, height, weight, gender.
Measurement results: ap_hi (systolic blood pressure), ap_lo (diastolic blood pressure), cholesterol, gluc.
Subjective features: smoke, alco, active.
Target: cardio (presence of cardiovascular disease).
data = pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id')
data.head()
id | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio
---|-----|--------|--------|--------|-------|-------|-------------|------|-------|------|--------|-------
0 | 18393 | 2 | 168 | 62.0 | 110 | 80 | 1 | 1 | 0 | 0 | 1 | 0
1 | 20228 | 1 | 156 | 85.0 | 140 | 90 | 3 | 1 | 0 | 0 | 1 | 1
2 | 18857 | 1 | 165 | 64.0 | 130 | 70 | 3 | 1 | 0 | 0 | 0 | 1
3 | 17623 | 2 | 169 | 82.0 | 150 | 100 | 1 | 1 | 0 | 0 | 1 | 1
4 | 17474 | 1 | 156 | 56.0 | 100 | 60 | 1 | 1 | 0 | 0 | 0 | 0
data.describe()
  | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio
---|-----|--------|--------|--------|-------|-------|-------------|------|-------|------|--------|-------
count | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000
mean | 19468.865814 | 1.349571 | 164.359229 | 74.205690 | 128.817286 | 96.630414 | 1.366871 | 1.226457 | 0.088129 | 0.053771 | 0.803729 | 0.499700
std | 2467.251667 | 0.476838 | 8.210126 | 14.395757 | 154.011419 | 188.472530 | 0.680250 | 0.572270 | 0.283484 | 0.225568 | 0.397179 | 0.500003
min | 10798.000000 | 1.000000 | 55.000000 | 10.000000 | -150.000000 | -70.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
25% | 17664.000000 | 1.000000 | 159.000000 | 65.000000 | 120.000000 | 80.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000
50% | 19703.000000 | 1.000000 | 165.000000 | 72.000000 | 120.000000 | 80.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000
75% | 21327.000000 | 2.000000 | 170.000000 | 82.000000 | 140.000000 | 90.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000
max | 23713.000000 | 2.000000 | 250.000000 | 200.000000 | 16020.000000 | 11000.000000 | 3.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000
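describe() already hints at data-entry problems: negative and four-to-five-digit blood pressures, heights up to 250 cm, weights down to 10 kg. A quick count of the implausibly entered pressures (the bounds below are my own rough choice, not part of the original analysis) shows how many rows the clean-up further down has to repair:

bad_ap = data[(data['ap_hi'] < 60) | (data['ap_hi'] > 250) |
              (data['ap_lo'] < 40) | (data['ap_lo'] > 200)]
print(len(bad_ap), 'rows with implausible pressure values')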
def get_X(df):
    # feature matrix: everything except the target
    return df.drop(['cardio'], axis=1)

def get_Y(df):
    # target column
    return df['cardio']

def examine(clf, df, random_state=42, scoring='neg_log_loss'):
    # 5-fold cross-validation; prints the mean score and its spread
    X, y = get_X(df), get_Y(df)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))  # the "std" figure is actually 2*std
from sklearn.metrics import make_scorer, log_loss
log_loss_score = make_scorer(log_loss, greater_is_better=False)
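Note that this scorer calls the estimator's predict (not predict_proba), so for regressors like Ridge or Lasso the raw output is treated as the probability of the positive class (log_loss clips the values internally). For proper classifiers the built-in 'neg_log_loss' string used in examine() already evaluates predict_proba; an explicit probability-based scorer would be a sketch along these lines:

# probability-based equivalent of 'neg_log_loss' (not used below)
proba_log_loss_score = make_scorer(log_loss, greater_is_better=False, needs_proba=True)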
def examine_scaled(clf, df, random_state=42, scoring='neg_log_loss'):
    # same as examine(), but standardizes the features first
    # (the scaler is fit on the full set, so the CV folds share its statistics)
    X, y = get_X(df), get_Y(df)
    X = StandardScaler().fit_transform(X)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))
def describeImportance(clf, X):
    # print features sorted by tree-based feature_importances_
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]],
                                          clf.feature_importances_[indices[f]]))

def describeCoef(clf, X):
    # print features sorted by absolute coefficient value
    # (assumes a 1D coef_, as with Lasso/Ridge)
    coefs = clf.coef_
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f, indices[f], X.columns[indices[f]], coefs[indices[f]]))
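describeCoef relies on coef_ being flat, which holds for Lasso and Ridge. LogisticRegression stores a (1, n_features) array for a binary target, so a flattened variant would be needed there; a minimal sketch, not used below:

def describeCoef2D(clf, X):
    # same idea as describeCoef, but flattens a (1, n_features) coef_ first
    coefs = np.ravel(clf.coef_)
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f, indices[f], X.columns[indices[f]], coefs[indices[f]]))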
clf = Ridge()
examine_scaled(clf, data, scoring=log_loss_score)
Log Loss: -0.6588166038251394, std: 0.016590990820719284
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, data)
Log Loss: -0.5779710936401263, std: 0.004090232271378004
clf = XGBClassifier(seed=42, nthread=4)
examine(clf, data)
Log Loss: -0.5396301910766, std: 0.005472783959989331
clf = DummyClassifier()
examine(clf, data)
Log Loss: -17.278269597099744, std: 0.10291377356163346
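The default DummyClassifier emits essentially hard 0/1 probabilities, which log loss punishes severely, hence the value around 17. A more useful "no-information" baseline is a constant prediction of the class prior (e.g. DummyClassifier(strategy='prior')), whose log loss is just the binary entropy of the target, roughly ln 2 here:

# log loss of always predicting the class prior p = P(cardio = 1)
p = data['cardio'].mean()
print(-(p * np.log(p) + (1 - p) * np.log(1 - p)))  # about 0.693 for p close to 0.5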
data[data['height'] > 200]
id | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio
---|-----|--------|--------|--------|-------|-------|-------------|------|-------|------|--------|-------
9223 | 21220 | 1 | 250 | 86.0 | 140 | 100 | 3 | 1 | 0 | 0 | 1 | 1
30894 | 19054 | 2 | 207 | 78.0 | 100 | 70 | 1 | 1 | 0 | 1 | 1 | 0
data[(data['height'] < 100) & (data['weight'] >= 100)]
id | age | gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio
---|-----|--------|--------|--------|-------|-------|-------------|------|-------|------|--------|-------
11662 | 17646 | 2 | 97 | 170.0 | 160 | 100 | 1 | 1 | 1 | 0 | 1 | 1
18218 | 19594 | 1 | 75 | 168.0 | 120 | 80 | 1 | 1 | 1 | 0 | 1 | 1
34186 | 19074 | 1 | 81 | 156.0 | 140 | 90 | 1 | 1 | 0 | 0 | 1 | 0
39156 | 15292 | 1 | 80 | 178.0 | 140 | 90 | 3 | 3 | 0 | 0 | 1 | 1
sns.violinplot(y='height', x='gender', data=data[(data['height'] < 190) & (data['height'] > 140)])
def prepare(df):
    df = df.copy()
    # the test set contains the string 'None' in a few cells
    df.replace('None', 0, inplace=True)
    df[['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']] = df[['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']].apply(pd.to_numeric)
    # age is given in days, convert to full years
    df['age'] = df['age'] // 365.25
    # blood pressure clean-up: drop signs, strip extra digits, restore missing ones
    df['ap_hi'] = np.abs(df['ap_hi'])
    df['ap_lo'] = np.abs(df['ap_lo'])
    df.loc[df['ap_hi'] >= 5000, 'ap_hi'] /= 100
    df.loc[df['ap_hi'] >= 300, 'ap_hi'] /= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_lo'] >= 5000, 'ap_lo'] /= 100
    df.loc[df['ap_lo'] >= 300, 'ap_lo'] /= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    # zero diastolic pressure: impute a typical value
    df.loc[df['ap_lo'] < 1, 'ap_lo'] = 70
    # swap systolic/diastolic where they were entered in reverse order
    index = (df['ap_lo'] > df['ap_hi'])
    ap_lo = df.loc[index, 'ap_lo']
    ap_hi = df.loc[index, 'ap_hi']
    df.loc[index, 'ap_hi'] = ap_lo
    df.loc[index, 'ap_lo'] = ap_hi
    # pulse pressure and an approximation of mean arterial pressure, (SBP + 2*DBP) / 3
    df['ap_delta'] = np.abs(df['ap_hi'] - df['ap_lo'])
    df['ap_avg'] = np.round((df['ap_hi'] + 2 * df['ap_lo']) / 3)
    # height clean-up: extra/missing leading digit, height and weight swapped
    df.loc[df['height'] > 210, 'height'] -= 100
    df.loc[(df['height'] < 100) & (df['weight'] < 100), 'height'] += 100
    index = (df['height'] < 100) & (df['weight'] >= 100)
    h = df.loc[index, 'height']
    w = df.loc[index, 'weight']
    df.loc[index, 'height'] = w
    df.loc[index, 'weight'] = h
    # derived features: BMI ("imt"), combined cholesterol/glucose score,
    # excess over a Broca-style ideal weight (height - 105), and interactions
    df['imt'] = np.round(df['weight'] / ((df['height'] / 100.0) ** 2))
    df['4*ch+gl'] = 4 * df['cholesterol'] + df['gluc']
    df['w-h+100'] = df['weight'] - (df['height'] - 105)
    # df = pd.get_dummies(df, columns=['gender', 'cholesterol', 'gluc'])
    # ch_labels = df.groupby(by='cholesterol')['ap_avg'].median()
    # df['cholesterol-ap_avg'] = df['cholesterol'].apply(lambda e: ch_labels[e])
    # alco_labels = df.groupby(by='alco')['ap_avg'].median()
    # df['alco-ap_avg'] = df['alco'].apply(lambda e: alco_labels[e])
    # smoke_labels = df.groupby(by='smoke')['ap_avg'].median()
    # df['smoke-ap_avg'] = df['smoke'].apply(lambda e: smoke_labels[e])
    df['imt*ap_avg'] = df['imt'] * df['ap_avg']
    df['(4*ch+gl)*ap_avg'] = df['4*ch+gl'] * df['ap_avg']
    return df.drop(['gender'], axis=1)
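The cascading pressure corrections are easiest to follow on a few raw values; the inputs below are hypothetical, picked to match the anomalies visible in describe():

raw = pd.Series([16020.0, 1100.0, 11.0, 1.0, 120.0])
fixed = raw.copy()
fixed.loc[fixed >= 5000] /= 100   # 16020 -> 160.2   (two extra digits)
fixed.loc[fixed >= 300] /= 10     # 1100  -> 110.0   (one extra digit)
fixed.loc[fixed <= 20] *= 10      # 11 -> 110.0, 1 -> 10.0 (missing digits)
fixed.loc[fixed <= 20] *= 10      # 10.0 -> 100.0
print(fixed.tolist())             # [160.2, 110.0, 110.0, 100.0, 120.0]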
d = prepare(data)
d.head()
id | age | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio | ap_delta | ap_avg | imt | 4*ch+gl | w-h+100 | imt*ap_avg
---|-----|--------|--------|-------|-------|-------------|------|-------|------|--------|--------|----------|--------|-----|---------|---------|-----------
0 | 50.0 | 168.0 | 62.0 | 110.0 | 80.0 | 1 | 1 | 0 | 0 | 1 | 0 | 30.0 | 90.0 | 22.0 | 5 | -1.0 | 1980.0
1 | 55.0 | 156.0 | 85.0 | 140.0 | 90.0 | 3 | 1 | 0 | 0 | 1 | 1 | 50.0 | 107.0 | 35.0 | 13 | 34.0 | 3745.0
2 | 51.0 | 165.0 | 64.0 | 130.0 | 70.0 | 3 | 1 | 0 | 0 | 0 | 1 | 60.0 | 90.0 | 24.0 | 13 | 4.0 | 2160.0
3 | 48.0 | 169.0 | 82.0 | 150.0 | 100.0 | 1 | 1 | 0 | 0 | 1 | 1 | 50.0 | 117.0 | 29.0 | 5 | 18.0 | 3393.0
4 | 47.0 | 156.0 | 56.0 | 100.0 | 60.0 | 1 | 1 | 0 | 0 | 0 | 0 | 40.0 | 73.0 | 23.0 | 5 | 5.0 | 1679.0
d.describe()
  | age | height | weight | ap_hi | ap_lo | gluc | smoke | alco | active | cardio | ap_delta | ap_avg | imt | 4*ch+gl | w-h+100 | imt*ap_avg | (4*ch+gl)*ap_avg
---|-----|--------|--------|-------|-------|------|-------|------|--------|--------|----------|--------|-----|---------|---------|------------|-----------------
count | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000 | 70000.000000
mean | 52.803257 | 164.398357 | 74.200847 | 127.082957 | 81.589693 | 1.226457 | 0.088129 | 0.053771 | 0.803729 | 0.499700 | 45.493265 | 96.656643 | 27.494843 | 6.693943 | 14.802490 | 2674.266014 | 653.579957
std | 6.762462 | 7.992059 | 14.378460 | 17.084971 | 9.779714 | 0.572270 | 0.283484 | 0.225568 | 0.397179 | 0.500003 | 11.827921 | 11.471790 | 5.383962 | 3.022861 | 14.197317 | 697.920760 | 327.402713
min | 29.000000 | 100.000000 | 10.000000 | 60.000000 | 24.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 44.000000 | 3.000000 | 5.000000 | -62.000000 | 309.000000 | 220.000000
25% | 48.000000 | 159.000000 | 65.000000 | 120.000000 | 80.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 40.000000 | 93.000000 | 24.000000 | 5.000000 | 5.000000 | 2232.000000 | 465.000000
50% | 53.000000 | 165.000000 | 72.000000 | 120.000000 | 80.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 40.000000 | 93.000000 | 26.000000 | 5.000000 | 12.000000 | 2520.000000 | 490.000000
75% | 58.000000 | 170.000000 | 82.000000 | 140.000000 | 90.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 50.000000 | 103.000000 | 30.000000 | 9.000000 | 23.000000 | 3000.000000 | 738.000000
max | 64.000000 | 207.000000 | 200.000000 | 240.000000 | 182.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 176.000000 | 187.000000 | 153.000000 | 15.000000 | 166.000000 | 19551.000000 | 2595.000000
# previous CV score: 0.5388669688674089
clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
examine(clf, d)
Log Loss: -0.5389820855843168, std: 0.0055894181462354435
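As with the Lasso coefficients below, it can be instructive to fit this boosted model once on the full prepared set and look at its feature importances via describeImportance (a quick sketch, not a separately validated model):

clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
clf.fit(get_X(d), get_Y(d))
describeImportance(clf, get_X(d))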
# previous CV score: 0.555136801754552
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, d)
Log Loss: -0.5574158467263418, std: 0.00483120321114506
clf = Lasso(normalize=True, alpha=0.0000001)
examine_scaled(clf, d, scoring=log_loss_score)
clf.fit(get_X(d), get_Y(d))
describeCoef(clf, get_X(d))
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems. (repeated for each of the 5 CV folds)
Log Loss: -0.6805987237215099, std: 0.02503242794013113
0. feature 5 cholesterol (0.667676)
1. feature 6 gluc (0.117905)
2. feature 9 active (-0.041944)
3. feature 11 ap_avg (0.040548)
4. feature 8 alco (-0.037981)
5. feature 13 4*ch+gl (-0.031172)
6. feature 7 smoke (-0.027784)
7. feature 12 imt (0.023266)
8. feature 2 weight (0.015991)
9. feature 1 height (-0.014786)
10. feature 14 w-h+100 (-0.012969)
11. feature 0 age (0.009825)
12. feature 4 ap_lo (-0.009407)
13. feature 10 ap_delta (0.005307)
14. feature 3 ap_hi (-0.004212)
15. feature 16 (4*ch+gl)*ap_avg (-0.001119)
16. feature 15 imt*ap_avg (-0.000258)
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10,5))
examine_scaled(clf, d)
Log Loss: -0.5415010024672596, std: 0.005026404506139066
# sns.distplot(data['age'])
sns.distplot(data[data['cardio'] == 1]['age'], color='red')
sns.distplot(data[data['cardio'] != 1]['age'], color='green')
sns.distplot(data[data['cardio'] == 1]['height'])
sns.distplot(data[data['cardio'] != 1]['height'])
sns.distplot(d[d['cardio'] == 1]['weight'], color='red')
sns.distplot(d[d['cardio'] != 1]['weight'], color='green')
sns.distplot(d[(d['cardio'] == 1)]['imt'], color='red')
sns.distplot(d[(d['cardio'] == 0)]['imt'], color='green')
print(d['ap_hi'].min())
print(d['ap_hi'].max())
sns.distplot(d[d['ap_hi'] < 250]['ap_hi'])
1
16020
sns.distplot(d[(d['cardio'] == 1) & (d['ap_delta'] < 100) & (d['ap_lo'] > 90)]['ap_delta'], color='red')
sns.distplot(d[(d['cardio'] == 0) & (d['ap_delta'] < 100) & (d['ap_lo'] > 90)]['ap_delta'], color='green')
sns.distplot(data[(data['cardio'] == 1)]['gluc'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['gluc'], color='green', kde=False)
sns.distplot(data[(data['cardio'] == 1)]['cholesterol'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['cholesterol'], color='green', kde=False)
d2 = data.copy()
d2['4*ch+gl'] = 4 * d2['cholesterol'] + d2['gluc']
sns.distplot(d2[(d2['cardio'] == 1)]['4*ch+gl'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['4*ch+gl'], color='green', kde=False)
d2 = d.copy()
d2['w-h+100'] = d2['weight'] - (d2['height'] - 105)
sns.distplot(d2[(d2['cardio'] == 1)]['w-h+100'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['w-h+100'], color='green', kde=False)
def scatter(df, x, y):
    plt.scatter(df[(df['cardio'] == 1)][x], df[(df['cardio'] == 1)][y], color='red', alpha=0.5)
    plt.scatter(df[(df['cardio'] == 0)][x], df[(df['cardio'] == 0)][y], color='green', alpha=0.5)
scatter(d2, 'w-h+100', 'ap_avg')
d2 = d.copy()
sns.heatmap(d2.corr(), square=True)
# d.groupby(by='cholesterol')['ap_avg'].median()
d.groupby(by='smoke')['ap_avg'].mean()
smoke
0    96.652338
1    97.807198
Name: ap_avg, dtype: float64
def trend(df, column):
    # empirical probability of cardio=1 for each value of the column, with a regression fit
    h = df.groupby([column])['cardio'].sum() / df.groupby([column])['cardio'].count()
    sns.jointplot(h.index, h.values, kind="reg")
trend(d, 'ap_avg')
trend(d, 'ap_delta')
trend(d, 'imt*ap_avg')
trend(d, '4*ch+gl')
trend(d, 'w-h+100')
trend(d, '(4*ch+gl)*ap_avg')
def predict(clf, fname='submission_baseline_2.csv'):
    # fit on the full prepared training set and write predicted probabilities for the test set
    train = prepare(pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    test = prepare(pd.read_csv('data/test.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    clf.fit(get_X(train), get_Y(train))
    pred = clf.predict_proba(test)
    result = pd.DataFrame(index=test.index)
    result['cardio'] = pred[:, 1]  # probability of the positive class
    result.to_csv(fname, header=None, index=None)
predict(XGBClassifier(seed=42, nthread=4, n_estimators=1000, max_depth=6))
from sklearn.ensemble import VotingClassifier
estimators = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=500, max_depth=(3+(i%3)))) for i in range(50)]
clf = VotingClassifier(estimators, voting='soft')
predict(clf, 'submission_ensemble_1.csv')