import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10  # enlarge figure size
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
Objective features: age, height, weight, gender.
Examination results: ap_hi, ap_lo, cholesterol, gluc.
Subjective features: smoke, alco, active. Target: cardio.
data = pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id')
data.head()
data.describe()
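A quick check that the loaded frame matches the feature groups listed above (a sketch; the grouping simply follows the column names used later in this notebook):
expected = ['age', 'height', 'weight', 'gender',       # objective
            'ap_hi', 'ap_lo', 'cholesterol', 'gluc',   # examination results
            'smoke', 'alco', 'active',                 # subjective
            'cardio']                                  # target
# any mismatch in either direction is printed as two sets
print(set(expected) - set(data.columns), set(data.columns) - set(expected))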
def get_X(df):
    return df.drop(['cardio'], axis=1)

def get_Y(df):
    return df['cardio']
def examine(clf, df, random_state=42, scoring='neg_log_loss'):
    X, y = get_X(df), get_Y(df)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log loss: {}, 2*std: {}".format(scores.mean(), 2 * scores.std()))
from sklearn.metrics import make_scorer, log_loss
# scorer that feeds raw model predictions into log_loss; used below to score
# regressors (Ridge/Lasso), which have no predict_proba
log_loss_score = make_scorer(log_loss, greater_is_better=False)
def examine_scaled(clf, df, random_state=42, scoring='neg_log_loss'):
    X, y = get_X(df), get_Y(df)
    # note: the scaler is fit on the full dataset before cross-validation
    X = StandardScaler().fit_transform(X)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log loss: {}, 2*std: {}".format(scores.mean(), 2 * scores.std()))
def describeImportance(clf, X):
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]],
                                          clf.feature_importances_[indices[f]]))
def describeCoef(clf, X):
    # ravel handles both 1-d coef_ (Lasso) and 2-d coef_ (LogisticRegression)
    coefs = np.ravel(clf.coef_)
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]], coefs[indices[f]]))
clf = Ridge()
examine_scaled(clf, data, scoring=log_loss_score)
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, data)
clf = XGBClassifier(seed=42, nthread=4)
examine(clf, data)
clf = DummyClassifier()
examine(clf, data)
data[data['height'] > 200]
data[(data['height'] < 100) & (data['weight'] >= 100)]
sns.violinplot(y='height', x='gender', data=data[(data['height'] < 190) & (data['height'] > 140)])
def prepare(df):
    df = df.copy()
    df.replace('None', 0, inplace=True)
    num_cols = ['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']
    df[num_cols] = df[num_cols].apply(pd.to_numeric)
    df['age'] = df['age'] // 365.25  # days -> full years
    # blood pressure clean-up: drop signs, then rescale obvious entry errors
    df['ap_hi'] = np.abs(df['ap_hi'])
    df['ap_lo'] = np.abs(df['ap_lo'])
    df.loc[df['ap_hi'] >= 5000, 'ap_hi'] /= 100
    df.loc[df['ap_hi'] >= 300, 'ap_hi'] /= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10  # applied twice on purpose, e.g. 1.2 -> 12 -> 120
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_lo'] >= 5000, 'ap_lo'] /= 100
    df.loc[df['ap_lo'] >= 300, 'ap_lo'] /= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] < 1, 'ap_lo'] = 70
    # swap systolic/diastolic where they were clearly entered in the wrong order
    index = (df['ap_lo'] > df['ap_hi'])
    ap_lo = df.loc[index, 'ap_lo']
    ap_hi = df.loc[index, 'ap_hi']
    df.loc[index, 'ap_hi'] = ap_lo
    df.loc[index, 'ap_lo'] = ap_hi
    # df['ap_delta'] = np.abs(df['ap_hi'] - df['ap_lo'])
    df['ap_avg'] = np.round((df['ap_hi'] + 2 * df['ap_lo']) / 3)  # mean arterial pressure estimate
    # height clean-up and height/weight swaps for obvious entry errors
    df.loc[df['height'] > 210, 'height'] -= 100
    df.loc[(df['height'] < 100) & (df['weight'] < 100), 'height'] += 100
    index = (df['height'] < 100) & (df['weight'] >= 100)
    h = df.loc[index, 'height']
    w = df.loc[index, 'weight']
    df.loc[index, 'height'] = w
    df.loc[index, 'weight'] = h
    df['imt'] = np.round(df['weight'] / ((df['height'] / 100.0) ** 2))  # body mass index
    df['4*ch+gl'] = 4 * df['cholesterol'] + df['gluc']
    df['w-h+100'] = df['weight'] - (df['height'] - 105)  # excess over the "height - 105" reference weight
    # df = pd.get_dummies(df, columns=['gender', 'cholesterol', 'gluc'])
    # ch_labels = df.groupby(by='cholesterol')['ap_avg'].median()
    # df['cholesterol-ap_avg'] = df['cholesterol'].apply(lambda e: ch_labels[e])
    # alco_labels = df.groupby(by='alco')['ap_avg'].median()
    # df['alco-ap_avg'] = df['alco'].apply(lambda e: alco_labels[e])
    # smoke_labels = df.groupby(by='smoke')['ap_avg'].median()
    # df['smoke-ap_avg'] = df['smoke'].apply(lambda e: smoke_labels[e])
    df['imt*ap_avg'] = df['imt'] * df['ap_avg']
    # df['(4*ch+gl)*ap_avg'] = df['4*ch+gl'] * df['ap_avg']
    return df.drop(['gender'], axis=1)
d = prepare(data)
d.head()
d.describe()
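describe() already hints at the value ranges; a more explicit sanity check on the cleaned pressures and heights (a sketch, and the plausibility bounds below are assumptions rather than part of the pipeline):
# rough plausibility bounds: systolic 60-250, diastolic 30-200, height 100-210 cm
print((~d['ap_hi'].between(60, 250)).sum(), 'suspicious ap_hi values')
print((~d['ap_lo'].between(30, 200)).sum(), 'suspicious ap_lo values')
print((~d['height'].between(100, 210)).sum(), 'suspicious height values')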
#0.5388669688674089
clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
examine(clf, d)
#0.555136801754552
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, d)
clf = Lasso(normalize=True, alpha=0.0000001)
examine_scaled(clf, d, scoring=log_loss_score)
clf.fit(get_X(d), get_Y(d))
describeCoef(clf, get_X(d))
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10,5))
examine_scaled(clf, d)
# sns.distplot(data['age'])
sns.distplot(data[data['cardio'] == 1]['age'], color='red')
sns.distplot(data[data['cardio'] != 1]['age'], color='green')
sns.distplot(data[data['cardio'] == 1]['height'])
sns.distplot(data[data['cardio'] != 1]['height'])
sns.distplot(d[d['cardio'] == 1]['weight'], color='red')
sns.distplot(d[d['cardio'] != 1]['weight'], color='green')
sns.distplot(d[(d['cardio'] == 1)]['imt'], color='red')
sns.distplot(d[(d['cardio'] == 0)]['imt'], color='green')
print(d['ap_hi'].min())
print(d['ap_hi'].max())
sns.distplot(d[d['ap_hi'] < 250]['ap_hi'])
d2 = d.copy()
d2['ap_delta'] = np.abs(d2['ap_hi'] - d2['ap_lo'])  # pulse pressure; not kept by prepare()
sns.distplot(d2[(d2['cardio'] == 1) & (d2['ap_delta'] < 100) & (d2['ap_lo'] > 90)]['ap_delta'], color='red')
sns.distplot(d2[(d2['cardio'] == 0) & (d2['ap_delta'] < 100) & (d2['ap_lo'] > 90)]['ap_delta'], color='green')
sns.distplot(data[(data['cardio'] == 1)]['gluc'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['gluc'], color='green', kde=False)
sns.distplot(data[(data['cardio'] == 1)]['cholesterol'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['cholesterol'], color='green', kde=False)
d2 = data.copy()
d2['4*ch+gl'] = 4 * d2['cholesterol'] + d2['gluc']
sns.distplot(d2[(d2['cardio'] == 1)]['4*ch+gl'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['4*ch+gl'], color='green', kde=False)
d2 = d.copy()
d2['w-h+100'] = d2['weight'] - (d2['height'] - 105)
sns.distplot(d2[(d2['cardio'] == 1)]['w-h+100'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['w-h+100'], color='green', kde=False)
def scatter(df, x, y):
    plt.scatter(df[(df['cardio'] == 1)][x], df[(df['cardio'] == 1)][y], color='red', alpha=0.5)
    plt.scatter(df[(df['cardio'] == 0)][x], df[(df['cardio'] == 0)][y], color='green', alpha=0.5)
scatter(d2, 'w-h+100', 'ap_avg')
d2 = d.copy()
sns.heatmap(d2.corr(), square=True)
# d.groupby(by='cholesterol')['ap_avg'].median()
d.groupby(by='smoke')['ap_avg'].mean()
def trend(df, column):
    # fraction of cardio == 1 for each value of the column
    h = df.groupby(column)['cardio'].mean()
    sns.jointplot(h.index, h.values, kind="reg")
trend(d, 'ap_avg')
d2 = d.copy()
d2['ap_delta'] = np.abs(d2['ap_hi'] - d2['ap_lo'])  # prepare() leaves this column out
trend(d2, 'ap_delta')
trend(d, 'imt*ap_avg')
trend(d, '4*ch+gl')
trend(d, 'w-h+100')
d2 = d.copy()
d2['(4*ch+gl)*ap_avg'] = d2['4*ch+gl'] * d2['ap_avg']  # prepare() leaves this column out
trend(d2, '(4*ch+gl)*ap_avg')
def predict(clf, fname='submission_baseline_2.csv'):
    train = prepare(pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    test = prepare(pd.read_csv('data/test.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    clf.fit(get_X(train), get_Y(train))
    pred = clf.predict_proba(test)
    result = pd.DataFrame(index=test.index)
    result['cardio'] = pred[:, 1]
    # one probability per line, in the order of the test file, without header or index
    result.to_csv(fname, header=None, index=None)
predict(XGBClassifier(seed=42, nthread=4, n_estimators=1000, max_depth=6))
from sklearn.ensemble import VotingClassifier
estimators = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=500, max_depth=(3+(i%3)))) for i in range(50)]
clf = VotingClassifier(estimators, voting='soft')
predict(clf, 'submission_ensemble_1.csv')
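Soft voting averages the per-class probabilities of the fitted base estimators. An equivalent manual sketch (illustration only; the helper average_proba is hypothetical and not used for the submission):
def average_proba(fitted_estimators, X):
    # unweighted mean of predict_proba over the ensemble members == 'soft' voting
    return np.mean([est.predict_proba(X) for est in fitted_estimators], axis=0)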