In [1]:

import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10 # увеличиваем размер картинок
import seaborn as sns

from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

Объективные признаки:

Возраст
Рост
Вес
Пол

Результаты измерения:

Артериальное давление верхнее и нижнее
Холестерин
Глюкоза

Субъективные признаки:

Курение
Употребление Алкоголя
Физическая активность

In [2]:

data = pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id')
data.head()

Out[2]:

	age	gender	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio
id
0	18393	2	168	62.0	110	80	1	1	0	0	1	0
1	20228	1	156	85.0	140	90	3	1	0	0	1	1
2	18857	1	165	64.0	130	70	3	1	0	0	0	1
3	17623	2	169	82.0	150	100	1	1	0	0	1	1
4	17474	1	156	56.0	100	60	1	1	0	0	0	0

In [6]:

data.describe()

Out[6]:

	age	gender	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio
count	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000
mean	19468.865814	1.349571	164.359229	74.205690	128.817286	96.630414	1.366871	1.226457	0.088129	0.053771	0.803729	0.499700
std	2467.251667	0.476838	8.210126	14.395757	154.011419	188.472530	0.680250	0.572270	0.283484	0.225568	0.397179	0.500003
min	10798.000000	1.000000	55.000000	10.000000	-150.000000	-70.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000
25%	17664.000000	1.000000	159.000000	65.000000	120.000000	80.000000	1.000000	1.000000	0.000000	0.000000	1.000000	0.000000
50%	19703.000000	1.000000	165.000000	72.000000	120.000000	80.000000	1.000000	1.000000	0.000000	0.000000	1.000000	0.000000
75%	21327.000000	2.000000	170.000000	82.000000	140.000000	90.000000	2.000000	1.000000	0.000000	0.000000	1.000000	1.000000
max	23713.000000	2.000000	250.000000	200.000000	16020.000000	11000.000000	3.000000	3.000000	1.000000	1.000000	1.000000	1.000000

In [3]:

def get_X(df):
    return df.drop(['cardio'], axis=1)

def get_Y(df):
    return df['cardio']

In [4]:

def examine(clf, df, random_state=42, scoring='neg_log_loss'):
    X, y = get_X(df), get_Y(df) 
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std())) 

In [5]:

from sklearn.metrics import make_scorer, log_loss
log_loss_score = make_scorer(log_loss, greater_is_better=False)

In [5]:

def examine_scaled(clf, df, random_state=42, scoring='neg_log_loss'):
    X, y = get_X(df), get_Y(df)
    X = StandardScaler().fit_transform(X)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std())) 

In [6]:

def describeImportance(clf, X):
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]],
                                          clf.feature_importances_[indices[f]]))
        
def describeCoef(clf, X):
    coefs = clf.coef_
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f, indices[f], X.columns[indices[f]], coefs[indices[f]]))

In [12]:

clf = Ridge()
examine_scaled(clf, data, scoring=log_loss_score)

Log Loss: -0.6588166038251394, std: 0.016590990820719284

In [14]:

clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, data)

Log Loss: -0.5779710936401263, std: 0.004090232271378004

In [15]:

clf = XGBClassifier(seed=42, nthread=4)
examine(clf, data)

Log Loss: -0.5396301910766, std: 0.005472783959989331

In [14]:

clf = DummyClassifier()
examine(clf, data)

Log Loss: -17.278269597099744, std: 0.10291377356163346

In [42]:

data[data['height'] > 200]

Out[42]:

	age	gender	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio
id
9223	21220	1	250	86.0	140	100	3	1	0	0	1	1
30894	19054	2	207	78.0	100	70	1	1	0	1	1	0

In [67]:

data[(data['height'] < 100) & (data['weight'] >= 100)]

Out[67]:

	age	gender	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio
id
11662	17646	2	97	170.0	160	100	1	1	1	0	1	1
18218	19594	1	75	168.0	120	80	1	1	1	0	1	1
34186	19074	1	81	156.0	140	90	1	1	0	0	1	0
39156	15292	1	80	178.0	140	90	3	3	0	0	1	1

In [57]:

sns.violinplot(y='height', x='gender', data=data[(data['height'] < 190) & (data['height'] > 140)])

Out[57]:

<matplotlib.axes._subplots.AxesSubplot at 0x1115396a0>

In [33]:

def prepare(df):
    df = df.copy()
    df.replace('None', 0, inplace=True)
    df[['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']] = df[['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']].apply(pd.to_numeric)
    df['age'] = df['age'] // 365.25
    df['ap_hi'] = np.abs(df['ap_hi'])
    df['ap_lo'] = np.abs(df['ap_lo'])
    df.loc[df['ap_hi'] >= 5000, 'ap_hi'] /= 100
    df.loc[df['ap_hi'] >= 300, 'ap_hi'] /= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_lo'] >= 5000, 'ap_lo'] /= 100
    df.loc[df['ap_lo'] >= 300, 'ap_lo'] /= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] < 1, 'ap_lo'] = 70
    
    index = (df['ap_lo'] > df['ap_hi'])
    ap_lo = df.loc[index, 'ap_lo']
    ap_hi = df.loc[index, 'ap_hi']
    df.loc[index, 'ap_hi'] = ap_lo
    df.loc[index, 'ap_lo'] = ap_hi
    
#     df['ap_delta'] = np.abs(df['ap_hi'] - df['ap_lo'])
    df['ap_avg'] = np.round((df['ap_hi'] + 2 * df['ap_lo']) / 3)
    df.loc[df['height'] > 210, 'height'] -= 100
    df.loc[(df['height'] < 100) & (df['weight'] < 100), 'height'] += 100
    
    index = (df['height'] < 100) & (df['weight'] >= 100)
    h = df.loc[index, 'height']
    w = df.loc[index, 'weight']
    df.loc[index, 'height'] = w
    df.loc[index, 'weight'] = h
    
    df['imt'] = np.round(df['weight'] / ((df['height'] / 100.0) ** 2))
    
    df['4*ch+gl'] = 4 * df['cholesterol'] + df['gluc']
    df['w-h+100'] = df['weight'] - (df['height'] - 105)
    
#     df = pd.get_dummies(df, columns=['gender', 'cholesterol', 'gluc'])
#     ch_labels = df.groupby(by='cholesterol')['ap_avg'].median()
#     df['cholesterol-ap_avg'] = df['cholesterol'].apply(lambda e: ch_labels[e])

#     alco_labels = df.groupby(by='alco')['ap_avg'].median()
#     df['alco-ap_avg'] = df['alco'].apply(lambda e: alco_labels[e])
    
#     smoke_labels = df.groupby(by='smoke')['ap_avg'].median()
#     df['smoke-ap_avg'] = df['smoke'].apply(lambda e: smoke_labels[e])
    
    df['imt*ap_avg'] = df['imt'] * df['ap_avg']
#     df['(4*ch+gl)*ap_avg'] = df['4*ch+gl'] * df['ap_avg']
    return df.drop(['gender'], axis=1)


d = prepare(data)
d.head()

Out[33]:

	age	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio	ap_delta	ap_avg	imt	4*ch+gl	w-h+100	imt*ap_avg
id
0	50.0	168.0	62.0	110.0	80.0	1	1	0	0	1	0	30.0	90.0	22.0	5	-1.0	1980.0
1	55.0	156.0	85.0	140.0	90.0	3	1	0	0	1	1	50.0	107.0	35.0	13	34.0	3745.0
2	51.0	165.0	64.0	130.0	70.0	3	1	0	0	0	1	60.0	90.0	24.0	13	4.0	2160.0
3	48.0	169.0	82.0	150.0	100.0	1	1	0	0	1	1	50.0	117.0	29.0	5	18.0	3393.0
4	47.0	156.0	56.0	100.0	60.0	1	1	0	0	0	0	40.0	73.0	23.0	5	5.0	1679.0

In [9]:

d.describe()

Out[9]:

	age	height	weight	ap_hi	ap_lo	gluc	smoke	alco	active	cardio	ap_delta	ap_avg	imt	4*ch+gl	w-h+100	imt*ap_avg	(4ch+gl)ap_avg
count	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000	70000.000000
mean	52.803257	164.398357	74.200847	127.082957	81.589693	1.226457	0.088129	0.053771	0.803729	0.499700	45.493265	96.656643	27.494843	6.693943	14.802490	2674.266014	653.579957
std	6.762462	7.992059	14.378460	17.084971	9.779714	0.572270	0.283484	0.225568	0.397179	0.500003	11.827921	11.471790	5.383962	3.022861	14.197317	697.920760	327.402713
min	29.000000	100.000000	10.000000	60.000000	24.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	44.000000	3.000000	5.000000	-62.000000	309.000000	220.000000
25%	48.000000	159.000000	65.000000	120.000000	80.000000	1.000000	0.000000	0.000000	1.000000	0.000000	40.000000	93.000000	24.000000	5.000000	5.000000	2232.000000	465.000000
50%	53.000000	165.000000	72.000000	120.000000	80.000000	1.000000	0.000000	0.000000	1.000000	0.000000	40.000000	93.000000	26.000000	5.000000	12.000000	2520.000000	490.000000
75%	58.000000	170.000000	82.000000	140.000000	90.000000	1.000000	0.000000	0.000000	1.000000	1.000000	50.000000	103.000000	30.000000	9.000000	23.000000	3000.000000	738.000000
max	64.000000	207.000000	200.000000	240.000000	182.000000	3.000000	1.000000	1.000000	1.000000	1.000000	176.000000	187.000000	153.000000	15.000000	166.000000	19551.000000	2595.000000

In [35]:

#0.5388669688674089
clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
examine(clf, d)

Log Loss: -0.5389820855843168, std: 0.0055894181462354435

In [36]:

#0.555136801754552
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, d)

Log Loss: -0.5574158467263418, std: 0.00483120321114506

In [66]:

clf = Lasso(normalize=True, alpha=0.0000001)
examine_scaled(clf, d, scoring=log_loss_score)
clf.fit(get_X(d), get_Y(d))
describeCoef(clf, get_X(d))

/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/usr/local/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

Log Loss: -0.6805987237215099, std: 0.02503242794013113
0. feature 5 cholesterol (0.667676)
1. feature 6 gluc (0.117905)
2. feature 9 active (-0.041944)
3. feature 11 ap_avg (0.040548)
4. feature 8 alco (-0.037981)
5. feature 13 4*ch+gl (-0.031172)
6. feature 7 smoke (-0.027784)
7. feature 12 imt (0.023266)
8. feature 2 weight (0.015991)
9. feature 1 height (-0.014786)
10. feature 14 w-h+100 (-0.012969)
11. feature 0 age (0.009825)
12. feature 4 ap_lo (-0.009407)
13. feature 10 ap_delta (0.005307)
14. feature 3 ap_hi (-0.004212)
15. feature 16 (4*ch+gl)*ap_avg (-0.001119)
16. feature 15 imt*ap_avg (-0.000258)

In [67]:

from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10,5))
examine_scaled(clf, d)

Log Loss: -0.5415010024672596, std: 0.005026404506139066

In [32]:

# sns.distplot(data['age'])
sns.distplot(data[data['cardio'] == 1]['age'], color='red')
sns.distplot(data[data['cardio'] != 1]['age'], color='green')

Out[32]:

<matplotlib.axes._subplots.AxesSubplot at 0x10f966470>

In [107]:

sns.distplot(data[data['cardio'] == 1]['height'])
sns.distplot(data[data['cardio'] != 1]['height'])

Out[107]:

<matplotlib.axes._subplots.AxesSubplot at 0x113e233c8>

In [115]:

sns.distplot(d[d['cardio'] == 1]['weight'], color='red')
sns.distplot(d[d['cardio'] != 1]['weight'], color='green')

Out[115]:

<matplotlib.axes._subplots.AxesSubplot at 0x1161b5978>

In [114]:

sns.distplot(d[(d['cardio'] == 1)]['imt'], color='red')
sns.distplot(d[(d['cardio'] == 0)]['imt'], color='green')

Out[114]:

<matplotlib.axes._subplots.AxesSubplot at 0x113cbd128>

In [116]:

print(d['ap_hi'].min())
print(d['ap_hi'].max())
sns.distplot(d[d['ap_hi'] < 250]['ap_hi'])

1
16020

Out[116]:

<matplotlib.axes._subplots.AxesSubplot at 0x1160cb320>

In [126]:

sns.distplot(d[(d['cardio'] == 1) & (d['delta_ap'] < 100) & (d['ap_lo'] > 90)]['delta_ap'], color='red')
sns.distplot(d[(d['cardio'] == 0) & (d['delta_ap'] < 100) & (d['ap_lo'] > 90)]['delta_ap'], color='green')

Out[126]:

<matplotlib.axes._subplots.AxesSubplot at 0x1198b3400>

In [271]:

sns.distplot(data[(data['cardio'] == 1)]['gluc'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['gluc'], color='green', kde=False)

Out[271]:

<matplotlib.axes._subplots.AxesSubplot at 0x11a5ec400>

In [272]:

sns.distplot(data[(data['cardio'] == 1)]['cholesterol'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['cholesterol'], color='green', kde=False)

Out[272]:

<matplotlib.axes._subplots.AxesSubplot at 0x1211d4400>

In [282]:

d2 = data.copy()
d2['4*ch+gl'] = 4 * d2['cholesterol'] + d2['gluc']

sns.distplot(d2[(d2['cardio'] == 1)]['4*ch+gl'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['4*ch+gl'], color='green', kde=False)

Out[282]:

<matplotlib.axes._subplots.AxesSubplot at 0x117a24f28>

In [307]:

d2 = d.copy()
d2['w-h+100'] = d2['weight'] - (d2['height'] - 105)

sns.distplot(d2[(d2['cardio'] == 1)]['w-h+100'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['w-h+100'], color='green', kde=False)

Out[307]:

<matplotlib.axes._subplots.AxesSubplot at 0x11f71f7f0>

In [309]:

def scatter(df, x, y):
    plt.scatter(df[(df['cardio'] == 1)][x], df[(df['cardio'] == 1)][y], color='red', alpha=0.5)
    plt.scatter(df[(df['cardio'] == 0)][x], df[(df['cardio'] == 0)][y], color='green', alpha=0.5)

In [312]:

scatter(d2, 'w-h+100', 'ap_avg')

In [64]:

d2 = d.copy()
sns.heatmap(d2.corr(), square=True)

Out[64]:

<matplotlib.axes._subplots.AxesSubplot at 0x11587f400>

In [347]:

# d.groupby(by='cholesterol')['ap_avg'].median()
d.groupby(by='smoke')['ap_avg'].mean()

Out[347]:

smoke
0    96.652338
1    97.807198
Name: ap_avg, dtype: float64

In [40]:

def trend(df, column):
    h = df.groupby([column])['cardio'].sum() / d.groupby([column])['cardio'].count()
    sns.jointplot(h.index, h.values, kind="reg")

In [42]:

trend(d, 'ap_avg')
trend(d, 'ap_delta')

In [50]:

trend(d, 'imt*ap_avg')

In [54]:

trend(d, '4*ch+gl')
trend(d, 'w-h+100')

In [57]:

trend(d, '(4*ch+gl)*ap_avg')

In [44]:

def predict(clf, fname='submission_baseline_2.csv'):
    train = prepare(pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    test = prepare(pd.read_csv('data/test.csv.gz', compression='gzip', delimiter=';', index_col='id'))

    clf.fit(get_X(train), get_Y(train))
    pred = clf.predict_proba(test)

    result = pd.DataFrame(index=test.index)
    result['cardio'] = pred[:, 1]
    result.to_csv(fname, header=None, index=None)

In [37]:

predict(XGBClassifier(seed=42, nthread=4, n_estimators=1000, max_depth=6))

In [45]:

from sklearn.ensemble import VotingClassifier

estimators = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=500, max_depth=(3+(i%3)))) for i in range(50)]

clf = VotingClassifier(estimators, voting='soft')
predict(clf, 'submission_ensemble_1.csv')