#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10  # enlarge the default figure size
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

# Objective features:
#
# - Age
# - Height
# - Weight
# - Gender
#
#
# Measurement results:
#
# - Systolic and diastolic blood pressure
# - Cholesterol
# - Glucose
#
#
# Subjective features:
#
# - Smoking
# - Alcohol consumption
# - Physical activity

# In[2]:

data = pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id')
data.head()

# In[6]:

data.describe()

# In[3]:

def get_X(df):
    return df.drop(['cardio'], axis=1)

def get_Y(df):
    return df['cardio']

# In[4]:

def examine(clf, df, random_state=42, scoring='neg_log_loss'):
    # 5-fold cross-validation; reports the mean score and a 2-sigma spread.
    X, y = get_X(df), get_Y(df)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))

# In[5]:

from sklearn.metrics import make_scorer, log_loss

# Scorer for estimators without predict_proba; log_loss clips raw predictions into (0, 1).
log_loss_score = make_scorer(log_loss, greater_is_better=False)

# In[5]:

def examine_scaled(clf, df, random_state=42, scoring='neg_log_loss'):
    # Same as examine(), but standardizes the features first.
    X, y = get_X(df), get_Y(df)
    X = StandardScaler().fit_transform(X)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))

# In[6]:

def describeImportance(clf, X):
    # Print tree-based feature importances, highest first.
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]], clf.feature_importances_[indices[f]]))

def describeCoef(clf, X):
    # Print linear-model coefficients ranked by absolute magnitude.
    coefs = clf.coef_
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]], coefs[indices[f]]))
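# Note (added): examine_scaled() fits the StandardScaler on the full dataset
# before cross-validation, so statistics from the validation folds leak into
# the scaler. A minimal leakage-free sketch, assuming the helpers above: wrap
# the scaler and the estimator in a Pipeline so the scaler is re-fit on each
# training fold. `examine_pipeline` is a new name, not from the original run.

# In[ ]:

from sklearn.pipeline import make_pipeline

def examine_pipeline(clf, df, random_state=42, scoring='neg_log_loss'):
    # The scaler is fit inside each training fold, then applied to the held-out fold.
    X, y = get_X(df), get_Y(df)
    pipe = make_pipeline(StandardScaler(), clf)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(pipe, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))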
# In[12]:

clf = Ridge()
examine_scaled(clf, data, scoring=log_loss_score)

# In[14]:

clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, data)

# In[15]:

clf = XGBClassifier(seed=42, nthread=4)
examine(clf, data)

# In[14]:

clf = DummyClassifier()
examine(clf, data)

# In[42]:

# Suspiciously tall patients.
data[data['height'] > 200]

# In[67]:

# Very short but heavy: height and weight look transposed.
data[(data['height'] < 100) & (data['weight'] >= 100)]

# In[57]:

sns.violinplot(y='height', x='gender', data=data[(data['height'] < 190) & (data['height'] > 140)])

# In[33]:

def prepare(df):
    df = df.copy()
    df.replace('None', 0, inplace=True)
    cols = ['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']
    df[cols] = df[cols].apply(pd.to_numeric)
    # Age is given in days; convert to full years.
    df['age'] = df['age'] // 365.25
    # Blood pressure: drop signs, then fix values recorded with extra or missing digits.
    df['ap_hi'] = np.abs(df['ap_hi'])
    df['ap_lo'] = np.abs(df['ap_lo'])
    df.loc[df['ap_hi'] >= 5000, 'ap_hi'] /= 100
    df.loc[df['ap_hi'] >= 300, 'ap_hi'] /= 10
    # The *10 step is applied twice on purpose: 1 -> 10 -> 100, 12 -> 120.
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_lo'] >= 5000, 'ap_lo'] /= 100
    df.loc[df['ap_lo'] >= 300, 'ap_lo'] /= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] < 1, 'ap_lo'] = 70
    # Swap systolic/diastolic where they were clearly entered in the wrong order.
    index = (df['ap_lo'] > df['ap_hi'])
    ap_lo = df.loc[index, 'ap_lo']
    ap_hi = df.loc[index, 'ap_hi']
    df.loc[index, 'ap_hi'] = ap_lo
    df.loc[index, 'ap_lo'] = ap_hi
    # Pulse pressure and a mean-arterial-pressure approximation.
    df['ap_delta'] = np.abs(df['ap_hi'] - df['ap_lo'])  # uncommented: later cells use this column
    df['ap_avg'] = np.round((df['ap_hi'] + 2 * df['ap_lo']) / 3)
    # Height fixes: 250 is likely 150; short heights with plausible weights miss
    # a leading 1; otherwise height and weight were probably transposed.
    df.loc[df['height'] > 210, 'height'] -= 100
    df.loc[(df['height'] < 100) & (df['weight'] < 100), 'height'] += 100
    index = (df['height'] < 100) & (df['weight'] >= 100)
    h = df.loc[index, 'height']
    w = df.loc[index, 'weight']
    df.loc[index, 'height'] = w
    df.loc[index, 'weight'] = h
    # Engineered features: BMI ('imt'), a combined cholesterol/glucose score,
    # and a Broca-style deviation of weight from (height - 105).
    df['imt'] = np.round(df['weight'] / ((df['height'] / 100.0) ** 2))
    df['4*ch+gl'] = 4 * df['cholesterol'] + df['gluc']
    df['w-h+100'] = df['weight'] - (df['height'] - 105)
    # df = pd.get_dummies(df, columns=['gender', 'cholesterol', 'gluc'])
    # ch_labels = df.groupby(by='cholesterol')['ap_avg'].median()
    # df['cholesterol-ap_avg'] = df['cholesterol'].apply(lambda e: ch_labels[e])
    # alco_labels = df.groupby(by='alco')['ap_avg'].median()
    # df['alco-ap_avg'] = df['alco'].apply(lambda e: alco_labels[e])
    # smoke_labels = df.groupby(by='smoke')['ap_avg'].median()
    # df['smoke-ap_avg'] = df['smoke'].apply(lambda e: smoke_labels[e])
    df['imt*ap_avg'] = df['imt'] * df['ap_avg']
    df['(4*ch+gl)*ap_avg'] = df['4*ch+gl'] * df['ap_avg']  # uncommented: later cells use this column
    return df.drop(['gender'], axis=1)

d = prepare(data)
d.head()

# In[9]:

d.describe()

# In[35]:

#0.5388669688674089
clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
examine(clf, d)

# In[36]:

#0.555136801754552
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, d)

# In[66]:

clf = Lasso(normalize=True, alpha=0.0000001)
examine_scaled(clf, d, scoring=log_loss_score)
clf.fit(get_X(d), get_Y(d))
describeCoef(clf, get_X(d))

# In[67]:

from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10, 5))
examine_scaled(clf, d)

# In[32]:

# sns.distplot(data['age'])
sns.distplot(data[data['cardio'] == 1]['age'], color='red')
sns.distplot(data[data['cardio'] != 1]['age'], color='green')

# In[107]:

sns.distplot(data[data['cardio'] == 1]['height'])
sns.distplot(data[data['cardio'] != 1]['height'])

# In[115]:

sns.distplot(d[d['cardio'] == 1]['weight'], color='red')
sns.distplot(d[d['cardio'] != 1]['weight'], color='green')
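# Before reading the engineered-feature plots below, a quick smoke test
# (added; the rows are hypothetical, not taken from the dataset) pushes a few
# pathological records through prepare() to confirm the cleaning rules:

# In[ ]:

raw = pd.DataFrame({
    'age': [365.25 * 50] * 3,
    'gender': [1, 2, 1],
    'height': [250, 76, 165],    # 250 -> 150; 76 with weight 170 -> swapped
    'weight': [80, 170, 70],
    'ap_hi': [12000, 11, 80],    # 12000 -> 120; 11 -> 110
    'ap_lo': [8000, 7, 120],     # 8000 -> 80; 7 -> 70; 120 > 80 -> swapped
    'cholesterol': [1, 2, 3],
    'gluc': [1, 1, 2],
    'smoke': [0, 0, 1],
    'alco': [0, 0, 0],
    'active': [1, 1, 0],
    'cardio': [0, 1, 1],
})
prepare(raw)[['ap_hi', 'ap_lo', 'height', 'weight', 'imt']]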
# In[114]:

sns.distplot(d[(d['cardio'] == 1)]['imt'], color='red')
sns.distplot(d[(d['cardio'] == 0)]['imt'], color='green')

# In[116]:

print(d['ap_hi'].min())
print(d['ap_hi'].max())
sns.distplot(d[d['ap_hi'] < 250]['ap_hi'])

# In[126]:

# Pulse pressure in the high-diastolic subgroup ('delta_ap' renamed to the
# 'ap_delta' column that prepare() actually produces).
sns.distplot(d[(d['cardio'] == 1) & (d['ap_delta'] < 100) & (d['ap_lo'] > 90)]['ap_delta'], color='red')
sns.distplot(d[(d['cardio'] == 0) & (d['ap_delta'] < 100) & (d['ap_lo'] > 90)]['ap_delta'], color='green')

# In[271]:

sns.distplot(data[(data['cardio'] == 1)]['gluc'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['gluc'], color='green', kde=False)

# In[272]:

sns.distplot(data[(data['cardio'] == 1)]['cholesterol'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['cholesterol'], color='green', kde=False)

# In[282]:

d2 = data.copy()
d2['4*ch+gl'] = 4 * d2['cholesterol'] + d2['gluc']
sns.distplot(d2[(d2['cardio'] == 1)]['4*ch+gl'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['4*ch+gl'], color='green', kde=False)

# In[307]:

d2 = d.copy()
d2['w-h+100'] = d2['weight'] - (d2['height'] - 105)
sns.distplot(d2[(d2['cardio'] == 1)]['w-h+100'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['w-h+100'], color='green', kde=False)

# In[309]:

def scatter(df, x, y):
    plt.scatter(df[(df['cardio'] == 1)][x], df[(df['cardio'] == 1)][y], color='red', alpha=0.5)
    plt.scatter(df[(df['cardio'] == 0)][x], df[(df['cardio'] == 0)][y], color='green', alpha=0.5)

# In[312]:

scatter(d2, 'w-h+100', 'ap_avg')

# In[64]:

d2 = d.copy()
sns.heatmap(d2.corr(), square=True)

# In[347]:

# d.groupby(by='cholesterol')['ap_avg'].median()
d.groupby(by='smoke')['ap_avg'].mean()

# In[40]:

def trend(df, column):
    # Share of cardio-positive patients per value of `column`
    # (fixed: the denominator used the global `d` instead of `df`).
    h = df.groupby([column])['cardio'].sum() / df.groupby([column])['cardio'].count()
    sns.jointplot(h.index, h.values, kind="reg")

# In[42]:

trend(d, 'ap_avg')
trend(d, 'ap_delta')

# In[50]:

trend(d, 'imt*ap_avg')

# In[54]:

trend(d, '4*ch+gl')
trend(d, 'w-h+100')

# In[57]:

trend(d, '(4*ch+gl)*ap_avg')

# In[44]:

def predict(clf, fname='submission_baseline_2.csv'):
    # Fit on the full cleaned train set and write positive-class probabilities
    # for the test set (no header, no index).
    train = prepare(pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    test = prepare(pd.read_csv('data/test.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    clf.fit(get_X(train), get_Y(train))
    pred = clf.predict_proba(test)
    result = pd.DataFrame(index=test.index)
    result['cardio'] = pred[:, 1]
    result.to_csv(fname, header=None, index=None)

# In[37]:

predict(XGBClassifier(seed=42, nthread=4, n_estimators=1000, max_depth=6))

# In[45]:

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of 50 XGBoost models with varied seeds and depths 3-5.
estimators = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=500, max_depth=(3 + (i % 3))))
              for i in range(50)]
clf = VotingClassifier(estimators, voting='soft')
predict(clf, 'submission_ensemble_1.csv')
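# The 50-model ensemble above goes straight to a submission file. A cheaper
# sanity check (added; a scaled-down sketch, not the original configuration)
# cross-validates a smaller version of the same ensemble on the prepared
# frame `d` before spending time on the full run:

# In[ ]:

small = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=100, max_depth=(3 + (i % 3))))
         for i in range(5)]
examine(VotingClassifier(small, voting='soft'), d)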