In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10 # увеличиваем размер картинок
import seaborn as sns

from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

Объективные признаки:

  • Возраст
  • Рост
  • Вес
  • Пол

Результаты измерения:

  • Артериальное давление верхнее и нижнее
  • Холестерин
  • Глюкоза

Субъективные признаки:

  • Курение
  • Употребление Алкоголя
  • Физическая активность
In [2]:
data = pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id')
data.head()
Out[2]:
age gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
id
0 18393 2 168 62.0 110 80 1 1 0 0 1 0
1 20228 1 156 85.0 140 90 3 1 0 0 1 1
2 18857 1 165 64.0 130 70 3 1 0 0 0 1
3 17623 2 169 82.0 150 100 1 1 0 0 1 1
4 17474 1 156 56.0 100 60 1 1 0 0 0 0
In [6]:
data.describe()
Out[6]:
age gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
count 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000
mean 19468.865814 1.349571 164.359229 74.205690 128.817286 96.630414 1.366871 1.226457 0.088129 0.053771 0.803729 0.499700
std 2467.251667 0.476838 8.210126 14.395757 154.011419 188.472530 0.680250 0.572270 0.283484 0.225568 0.397179 0.500003
min 10798.000000 1.000000 55.000000 10.000000 -150.000000 -70.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 17664.000000 1.000000 159.000000 65.000000 120.000000 80.000000 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000
50% 19703.000000 1.000000 165.000000 72.000000 120.000000 80.000000 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000
75% 21327.000000 2.000000 170.000000 82.000000 140.000000 90.000000 2.000000 1.000000 0.000000 0.000000 1.000000 1.000000
max 23713.000000 2.000000 250.000000 200.000000 16020.000000 11000.000000 3.000000 3.000000 1.000000 1.000000 1.000000 1.000000
In [3]:
def get_X(df):
    return df.drop(['cardio'], axis=1)

def get_Y(df):
    return df['cardio']
In [4]:
def examine(clf, df, random_state=42, scoring='neg_log_loss'):
    X, y = get_X(df), get_Y(df) 
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std())) 
In [5]:
from sklearn.metrics import make_scorer, log_loss
log_loss_score = make_scorer(log_loss, greater_is_better=False)
In [5]:
def examine_scaled(clf, df, random_state=42, scoring='neg_log_loss'):
    X, y = get_X(df), get_Y(df)
    X = StandardScaler().fit_transform(X)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std())) 
In [6]:
def describeImportance(clf, X):
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]],
                                          clf.feature_importances_[indices[f]]))
        
def describeCoef(clf, X):
    coefs = clf.coef_
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f, indices[f], X.columns[indices[f]], coefs[indices[f]]))
In [12]:
clf = Ridge()
examine_scaled(clf, data, scoring=log_loss_score)
Log Loss: -0.6588166038251394, std: 0.016590990820719284
In [14]:
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, data)
Log Loss: -0.5779710936401263, std: 0.004090232271378004
In [15]:
clf = XGBClassifier(seed=42, nthread=4)
examine(clf, data)
Log Loss: -0.5396301910766, std: 0.005472783959989331
In [14]:
clf = DummyClassifier()
examine(clf, data)
Log Loss: -17.278269597099744, std: 0.10291377356163346
In [42]:
data[data['height'] > 200]
Out[42]:
age gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
id
9223 21220 1 250 86.0 140 100 3 1 0 0 1 1
30894 19054 2 207 78.0 100 70 1 1 0 1 1 0
In [67]:
data[(data['height'] < 100) & (data['weight'] >= 100)]
Out[67]:
age gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
id
11662 17646 2 97 170.0 160 100 1 1 1 0 1 1
18218 19594 1 75 168.0 120 80 1 1 1 0 1 1
34186 19074 1 81 156.0 140 90 1 1 0 0 1 0
39156 15292 1 80 178.0 140 90 3 3 0 0 1 1
In [57]:
sns.violinplot(y='height', x='gender', data=data[(data['height'] < 190) & (data['height'] > 140)])
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x1115396a0>
In [33]:
def prepare(df):
    df = df.copy()
    df.replace('None', 0, inplace=True)
    df[['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']] = df[['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']].apply(pd.to_numeric)
    df['age'] = df['age'] // 365.25
    df['ap_hi'] = np.abs(df['ap_hi'])
    df['ap_lo'] = np.abs(df['ap_lo'])
    df.loc[df['ap_hi'] >= 5000, 'ap_hi'] /= 100
    df.loc[df['ap_hi'] >= 300, 'ap_hi'] /= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_lo'] >= 5000, 'ap_lo'] /= 100
    df.loc[df['ap_lo'] >= 300, 'ap_lo'] /= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] < 1, 'ap_lo'] = 70
    
    index = (df['ap_lo'] > df['ap_hi'])
    ap_lo = df.loc[index, 'ap_lo']
    ap_hi = df.loc[index, 'ap_hi']
    df.loc[index, 'ap_hi'] = ap_lo
    df.loc[index, 'ap_lo'] = ap_hi
    
#     df['ap_delta'] = np.abs(df['ap_hi'] - df['ap_lo'])
    df['ap_avg'] = np.round((df['ap_hi'] + 2 * df['ap_lo']) / 3)
    df.loc[df['height'] > 210, 'height'] -= 100
    df.loc[(df['height'] < 100) & (df['weight'] < 100), 'height'] += 100
    
    index = (df['height'] < 100) & (df['weight'] >= 100)
    h = df.loc[index, 'height']
    w = df.loc[index, 'weight']
    df.loc[index, 'height'] = w
    df.loc[index, 'weight'] = h
    
    df['imt'] = np.round(df['weight'] / ((df['height'] / 100.0) ** 2))
    
    df['4*ch+gl'] = 4 * df['cholesterol'] + df['gluc']
    df['w-h+100'] = df['weight'] - (df['height'] - 105)
    
#     df = pd.get_dummies(df, columns=['gender', 'cholesterol', 'gluc'])
#     ch_labels = df.groupby(by='cholesterol')['ap_avg'].median()
#     df['cholesterol-ap_avg'] = df['cholesterol'].apply(lambda e: ch_labels[e])

#     alco_labels = df.groupby(by='alco')['ap_avg'].median()
#     df['alco-ap_avg'] = df['alco'].apply(lambda e: alco_labels[e])
    
#     smoke_labels = df.groupby(by='smoke')['ap_avg'].median()
#     df['smoke-ap_avg'] = df['smoke'].apply(lambda e: smoke_labels[e])
    
    df['imt*ap_avg'] = df['imt'] * df['ap_avg']
#     df['(4*ch+gl)*ap_avg'] = df['4*ch+gl'] * df['ap_avg']
    return df.drop(['gender'], axis=1)


d = prepare(data)
d.head()
Out[33]:
age height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio ap_delta ap_avg imt 4*ch+gl w-h+100 imt*ap_avg
id
0 50.0 168.0 62.0 110.0 80.0 1 1 0 0 1 0 30.0 90.0 22.0 5 -1.0 1980.0
1 55.0 156.0 85.0 140.0 90.0 3 1 0 0 1 1 50.0 107.0 35.0 13 34.0 3745.0
2 51.0 165.0 64.0 130.0 70.0 3 1 0 0 0 1 60.0 90.0 24.0 13 4.0 2160.0
3 48.0 169.0 82.0 150.0 100.0 1 1 0 0 1 1 50.0 117.0 29.0 5 18.0 3393.0
4 47.0 156.0 56.0 100.0 60.0 1 1 0 0 0 0 40.0 73.0 23.0 5 5.0 1679.0
In [9]:
d.describe()
Out[9]:
age height weight ap_hi ap_lo gluc smoke alco active cardio ap_delta ap_avg imt 4*ch+gl w-h+100 imt*ap_avg (4*ch+gl)*ap_avg
count 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000
mean 52.803257 164.398357 74.200847 127.082957 81.589693 1.226457 0.088129 0.053771 0.803729 0.499700 45.493265 96.656643 27.494843 6.693943 14.802490 2674.266014 653.579957
std 6.762462 7.992059 14.378460 17.084971 9.779714 0.572270 0.283484 0.225568 0.397179 0.500003 11.827921 11.471790 5.383962 3.022861 14.197317 697.920760 327.402713
min 29.000000 100.000000 10.000000 60.000000 24.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 44.000000 3.000000 5.000000 -62.000000 309.000000 220.000000
25% 48.000000 159.000000 65.000000 120.000000 80.000000 1.000000 0.000000 0.000000 1.000000 0.000000 40.000000 93.000000 24.000000 5.000000 5.000000 2232.000000 465.000000
50% 53.000000 165.000000 72.000000 120.000000 80.000000 1.000000 0.000000 0.000000 1.000000 0.000000 40.000000 93.000000 26.000000 5.000000 12.000000 2520.000000 490.000000
75% 58.000000 170.000000 82.000000 140.000000 90.000000 1.000000 0.000000 0.000000 1.000000 1.000000 50.000000 103.000000 30.000000 9.000000 23.000000 3000.000000 738.000000
max 64.000000 207.000000 200.000000 240.000000 182.000000 3.000000 1.000000 1.000000 1.000000 1.000000 176.000000 187.000000 153.000000 15.000000 166.000000 19551.000000 2595.000000
In [35]:
#0.5388669688674089
clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
examine(clf, d)
Log Loss: -0.5389820855843168, std: 0.0055894181462354435
In [36]:
#0.555136801754552
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, d)
Log Loss: -0.5574158467263418, std: 0.00483120321114506
In [66]:
clf = Lasso(normalize=True, alpha=0.0000001)
examine_scaled(clf, d, scoring=log_loss_score)
clf.fit(get_X(d), get_Y(d))
describeCoef(clf, get_X(d))
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Log Loss: -0.6805987237215099, std: 0.02503242794013113
0. feature 5 cholesterol (0.667676)
1. feature 6 gluc (0.117905)
2. feature 9 active (-0.041944)
3. feature 11 ap_avg (0.040548)
4. feature 8 alco (-0.037981)
5. feature 13 4*ch+gl (-0.031172)
6. feature 7 smoke (-0.027784)
7. feature 12 imt (0.023266)
8. feature 2 weight (0.015991)
9. feature 1 height (-0.014786)
10. feature 14 w-h+100 (-0.012969)
11. feature 0 age (0.009825)
12. feature 4 ap_lo (-0.009407)
13. feature 10 ap_delta (0.005307)
14. feature 3 ap_hi (-0.004212)
15. feature 16 (4*ch+gl)*ap_avg (-0.001119)
16. feature 15 imt*ap_avg (-0.000258)
In [67]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10,5))
examine_scaled(clf, d)
Log Loss: -0.5415010024672596, std: 0.005026404506139066
In [32]:
# sns.distplot(data['age'])
sns.distplot(data[data['cardio'] == 1]['age'], color='red')
sns.distplot(data[data['cardio'] != 1]['age'], color='green')
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f966470>
In [107]:
sns.distplot(data[data['cardio'] == 1]['height'])
sns.distplot(data[data['cardio'] != 1]['height'])
Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x113e233c8>
In [115]:
sns.distplot(d[d['cardio'] == 1]['weight'], color='red')
sns.distplot(d[d['cardio'] != 1]['weight'], color='green')
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x1161b5978>
In [114]:
sns.distplot(d[(d['cardio'] == 1)]['imt'], color='red')
sns.distplot(d[(d['cardio'] == 0)]['imt'], color='green')
Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x113cbd128>
In [116]:
print(d['ap_hi'].min())
print(d['ap_hi'].max())
sns.distplot(d[d['ap_hi'] < 250]['ap_hi'])
1
16020
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x1160cb320>
In [126]:
sns.distplot(d[(d['cardio'] == 1) & (d['delta_ap'] < 100) & (d['ap_lo'] > 90)]['delta_ap'], color='red')
sns.distplot(d[(d['cardio'] == 0) & (d['delta_ap'] < 100) & (d['ap_lo'] > 90)]['delta_ap'], color='green')
Out[126]:
<matplotlib.axes._subplots.AxesSubplot at 0x1198b3400>
In [271]:
sns.distplot(data[(data['cardio'] == 1)]['gluc'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['gluc'], color='green', kde=False)
Out[271]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a5ec400>
In [272]:
sns.distplot(data[(data['cardio'] == 1)]['cholesterol'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['cholesterol'], color='green', kde=False)
Out[272]:
<matplotlib.axes._subplots.AxesSubplot at 0x1211d4400>
In [282]:
d2 = data.copy()
d2['4*ch+gl'] = 4 * d2['cholesterol'] + d2['gluc']

sns.distplot(d2[(d2['cardio'] == 1)]['4*ch+gl'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['4*ch+gl'], color='green', kde=False)
Out[282]:
<matplotlib.axes._subplots.AxesSubplot at 0x117a24f28>
In [307]:
d2 = d.copy()
d2['w-h+100'] = d2['weight'] - (d2['height'] - 105)

sns.distplot(d2[(d2['cardio'] == 1)]['w-h+100'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['w-h+100'], color='green', kde=False)
Out[307]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f71f7f0>
In [309]:
def scatter(df, x, y):
    plt.scatter(df[(df['cardio'] == 1)][x], df[(df['cardio'] == 1)][y], color='red', alpha=0.5)
    plt.scatter(df[(df['cardio'] == 0)][x], df[(df['cardio'] == 0)][y], color='green', alpha=0.5)
In [312]:
scatter(d2, 'w-h+100', 'ap_avg')
In [64]:
d2 = d.copy()
sns.heatmap(d2.corr(), square=True)
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x11587f400>
In [347]:
# d.groupby(by='cholesterol')['ap_avg'].median()
d.groupby(by='smoke')['ap_avg'].mean()
Out[347]:
smoke
0    96.652338
1    97.807198
Name: ap_avg, dtype: float64
In [40]:
def trend(df, column):
    h = df.groupby([column])['cardio'].sum() / d.groupby([column])['cardio'].count()
    sns.jointplot(h.index, h.values, kind="reg")
In [42]:
trend(d, 'ap_avg')
trend(d, 'ap_delta')
In [50]:
trend(d, 'imt*ap_avg')
In [54]:
trend(d, '4*ch+gl')
trend(d, 'w-h+100')
In [57]:
trend(d, '(4*ch+gl)*ap_avg')
In [44]:
def predict(clf, fname='submission_baseline_2.csv'):
    train = prepare(pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    test = prepare(pd.read_csv('data/test.csv.gz', compression='gzip', delimiter=';', index_col='id'))

    clf.fit(get_X(train), get_Y(train))
    pred = clf.predict_proba(test)

    result = pd.DataFrame(index=test.index)
    result['cardio'] = pred[:, 1]
    result.to_csv(fname, header=None, index=None)
In [37]:
predict(XGBClassifier(seed=42, nthread=4, n_estimators=1000, max_depth=6))
In [45]:
from sklearn.ensemble import VotingClassifier

estimators = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=500, max_depth=(3+(i%3)))) for i in range(50)]

clf = VotingClassifier(estimators, voting='soft')
predict(clf, 'submission_ensemble_1.csv')