#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 14, 10  # enlarge the default figure size
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

# Objective features:
#
# - Age
# - Height
# - Weight
# - Gender
#
#
# Measurement results:
#
# - Systolic and diastolic blood pressure
# - Cholesterol
# - Glucose
#
#
# Subjective features:
#
# - Smoking
# - Alcohol consumption
# - Physical activity

# In[2]:

data = pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id')
data.head()

# In[6]:

data.describe()

# In[3]:

def get_X(df):
    return df.drop(['cardio'], axis=1)

def get_Y(df):
    return df['cardio']

# In[4]:

def examine(clf, df, random_state=42, scoring='neg_log_loss'):
    # 5-fold cross-validation; reports the mean score and a 2-sigma spread.
    X, y = get_X(df), get_Y(df)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))

# In[5]:

from sklearn.metrics import make_scorer, log_loss

# Scorer for estimators without predict_proba; log_loss clips raw predictions into (0, 1).
log_loss_score = make_scorer(log_loss, greater_is_better=False)

# In[5]:

def examine_scaled(clf, df, random_state=42, scoring='neg_log_loss'):
    # Same as examine(), but standardizes the features first.
    X, y = get_X(df), get_Y(df)
    X = StandardScaler().fit_transform(X)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))

# In[6]:

def describeImportance(clf, X):
    # Print tree-based feature importances, highest first.
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]], clf.feature_importances_[indices[f]]))

def describeCoef(clf, X):
    # Print linear-model coefficients ranked by absolute magnitude.
    coefs = clf.coef_
    indices = np.argsort(np.abs(coefs))[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %d %s (%f)' % (f + 1, indices[f], X.columns[indices[f]], coefs[indices[f]]))
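# Note (added): examine_scaled() fits the StandardScaler on the full dataset
# before cross-validation, so statistics from the validation folds leak into
# the scaler. A minimal leakage-free sketch, assuming the helpers above: wrap
# the scaler and the estimator in a Pipeline so the scaler is re-fit on each
# training fold. `examine_pipeline` is a new name, not from the original run.

# In[ ]:

from sklearn.pipeline import make_pipeline

def examine_pipeline(clf, df, random_state=42, scoring='neg_log_loss'):
    # The scaler is fit inside each training fold, then applied to the held-out fold.
    X, y = get_X(df), get_Y(df)
    pipe = make_pipeline(StandardScaler(), clf)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    scores = cross_val_score(pipe, X, y, cv=folds, scoring=scoring)
    print("Log Loss: {}, std: {}".format(scores.mean(), 2 * scores.std()))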
# In[12]:

clf = Ridge()
examine_scaled(clf, data, scoring=log_loss_score)

# In[14]:

clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, data)

# In[15]:

clf = XGBClassifier(seed=42, nthread=4)
examine(clf, data)

# In[14]:

clf = DummyClassifier()
examine(clf, data)

# In[42]:

# Suspiciously tall patients.
data[data['height'] > 200]

# In[67]:

# Very short but heavy: height and weight look transposed.
data[(data['height'] < 100) & (data['weight'] >= 100)]

# In[57]:

sns.violinplot(y='height', x='gender', data=data[(data['height'] < 190) & (data['height'] > 140)])

# In[33]:

def prepare(df):
    df = df.copy()
    df.replace('None', 0, inplace=True)
    cols = ['age', 'ap_hi', 'ap_lo', 'height', 'weight', 'gender', 'active', 'alco', 'smoke']
    df[cols] = df[cols].apply(pd.to_numeric)
    # Age is given in days; convert to full years.
    df['age'] = df['age'] // 365.25
    # Blood pressure: drop signs, then fix values recorded with extra or missing digits.
    df['ap_hi'] = np.abs(df['ap_hi'])
    df['ap_lo'] = np.abs(df['ap_lo'])
    df.loc[df['ap_hi'] >= 5000, 'ap_hi'] /= 100
    df.loc[df['ap_hi'] >= 300, 'ap_hi'] /= 10
    # The *10 step is applied twice on purpose: 1 -> 10 -> 100, 12 -> 120.
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_hi'] <= 20, 'ap_hi'] *= 10
    df.loc[df['ap_lo'] >= 5000, 'ap_lo'] /= 100
    df.loc[df['ap_lo'] >= 300, 'ap_lo'] /= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] <= 20, 'ap_lo'] *= 10
    df.loc[df['ap_lo'] < 1, 'ap_lo'] = 70
    # Swap systolic/diastolic where they were clearly entered in the wrong order.
    index = (df['ap_lo'] > df['ap_hi'])
    ap_lo = df.loc[index, 'ap_lo']
    ap_hi = df.loc[index, 'ap_hi']
    df.loc[index, 'ap_hi'] = ap_lo
    df.loc[index, 'ap_lo'] = ap_hi
    # Pulse pressure and a mean-arterial-pressure approximation.
    df['ap_delta'] = np.abs(df['ap_hi'] - df['ap_lo'])  # uncommented: later cells use this column
    df['ap_avg'] = np.round((df['ap_hi'] + 2 * df['ap_lo']) / 3)
    # Height fixes: 250 is likely 150; short heights with plausible weights miss
    # a leading 1; otherwise height and weight were probably transposed.
    df.loc[df['height'] > 210, 'height'] -= 100
    df.loc[(df['height'] < 100) & (df['weight'] < 100), 'height'] += 100
    index = (df['height'] < 100) & (df['weight'] >= 100)
    h = df.loc[index, 'height']
    w = df.loc[index, 'weight']
    df.loc[index, 'height'] = w
    df.loc[index, 'weight'] = h
    # Engineered features: BMI ('imt'), a combined cholesterol/glucose score,
    # and a Broca-style deviation of weight from (height - 105).
    df['imt'] = np.round(df['weight'] / ((df['height'] / 100.0) ** 2))
    df['4*ch+gl'] = 4 * df['cholesterol'] + df['gluc']
    df['w-h+100'] = df['weight'] - (df['height'] - 105)
    # df = pd.get_dummies(df, columns=['gender', 'cholesterol', 'gluc'])
    # ch_labels = df.groupby(by='cholesterol')['ap_avg'].median()
    # df['cholesterol-ap_avg'] = df['cholesterol'].apply(lambda e: ch_labels[e])
    # alco_labels = df.groupby(by='alco')['ap_avg'].median()
    # df['alco-ap_avg'] = df['alco'].apply(lambda e: alco_labels[e])
    # smoke_labels = df.groupby(by='smoke')['ap_avg'].median()
    # df['smoke-ap_avg'] = df['smoke'].apply(lambda e: smoke_labels[e])
    df['imt*ap_avg'] = df['imt'] * df['ap_avg']
    df['(4*ch+gl)*ap_avg'] = df['4*ch+gl'] * df['ap_avg']  # uncommented: later cells use this column
    return df.drop(['gender'], axis=1)

d = prepare(data)
d.head()

# In[9]:

d.describe()

# In[35]:

#0.5388669688674089
clf = XGBClassifier(seed=42, nthread=4, n_estimators=100, max_depth=5)
examine(clf, d)

# In[36]:

#0.555136801754552
clf = LogisticRegression(n_jobs=4, random_state=42)
examine_scaled(clf, d)

# In[66]:

clf = Lasso(normalize=True, alpha=0.0000001)
examine_scaled(clf, d, scoring=log_loss_score)
clf.fit(get_X(d), get_Y(d))
describeCoef(clf, get_X(d))

# In[67]:

from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10, 5))
examine_scaled(clf, d)

# In[32]:

# sns.distplot(data['age'])
sns.distplot(data[data['cardio'] == 1]['age'], color='red')
sns.distplot(data[data['cardio'] != 1]['age'], color='green')

# In[107]:

sns.distplot(data[data['cardio'] == 1]['height'])
sns.distplot(data[data['cardio'] != 1]['height'])

# In[115]:

sns.distplot(d[d['cardio'] == 1]['weight'], color='red')
sns.distplot(d[d['cardio'] != 1]['weight'], color='green')
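# Before reading the engineered-feature plots below, a quick smoke test
# (added; the rows are hypothetical, not taken from the dataset) pushes a few
# pathological records through prepare() to confirm the cleaning rules:

# In[ ]:

raw = pd.DataFrame({
    'age': [365.25 * 50] * 3,
    'gender': [1, 2, 1],
    'height': [250, 76, 165],    # 250 -> 150; 76 with weight 170 -> swapped
    'weight': [80, 170, 70],
    'ap_hi': [12000, 11, 80],    # 12000 -> 120; 11 -> 110
    'ap_lo': [8000, 7, 120],     # 8000 -> 80; 7 -> 70; 120 > 80 -> swapped
    'cholesterol': [1, 2, 3],
    'gluc': [1, 1, 2],
    'smoke': [0, 0, 1],
    'alco': [0, 0, 0],
    'active': [1, 1, 0],
    'cardio': [0, 1, 1],
})
prepare(raw)[['ap_hi', 'ap_lo', 'height', 'weight', 'imt']]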
# In[114]:

sns.distplot(d[(d['cardio'] == 1)]['imt'], color='red')
sns.distplot(d[(d['cardio'] == 0)]['imt'], color='green')

# In[116]:

print(d['ap_hi'].min())
print(d['ap_hi'].max())
sns.distplot(d[d['ap_hi'] < 250]['ap_hi'])

# In[126]:

# Pulse pressure in the high-diastolic subgroup ('delta_ap' renamed to the
# 'ap_delta' column that prepare() actually produces).
sns.distplot(d[(d['cardio'] == 1) & (d['ap_delta'] < 100) & (d['ap_lo'] > 90)]['ap_delta'], color='red')
sns.distplot(d[(d['cardio'] == 0) & (d['ap_delta'] < 100) & (d['ap_lo'] > 90)]['ap_delta'], color='green')

# In[271]:

sns.distplot(data[(data['cardio'] == 1)]['gluc'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['gluc'], color='green', kde=False)

# In[272]:

sns.distplot(data[(data['cardio'] == 1)]['cholesterol'], color='red', kde=False)
sns.distplot(data[(data['cardio'] == 0)]['cholesterol'], color='green', kde=False)

# In[282]:

d2 = data.copy()
d2['4*ch+gl'] = 4 * d2['cholesterol'] + d2['gluc']
sns.distplot(d2[(d2['cardio'] == 1)]['4*ch+gl'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['4*ch+gl'], color='green', kde=False)

# In[307]:

d2 = d.copy()
d2['w-h+100'] = d2['weight'] - (d2['height'] - 105)
sns.distplot(d2[(d2['cardio'] == 1)]['w-h+100'], color='red', kde=False)
sns.distplot(d2[(d2['cardio'] == 0)]['w-h+100'], color='green', kde=False)

# In[309]:

def scatter(df, x, y):
    plt.scatter(df[(df['cardio'] == 1)][x], df[(df['cardio'] == 1)][y], color='red', alpha=0.5)
    plt.scatter(df[(df['cardio'] == 0)][x], df[(df['cardio'] == 0)][y], color='green', alpha=0.5)

# In[312]:

scatter(d2, 'w-h+100', 'ap_avg')

# In[64]:

d2 = d.copy()
sns.heatmap(d2.corr(), square=True)

# In[347]:

# d.groupby(by='cholesterol')['ap_avg'].median()
d.groupby(by='smoke')['ap_avg'].mean()

# In[40]:

def trend(df, column):
    # Share of cardio-positive patients per value of `column`
    # (fixed: the denominator used the global `d` instead of `df`).
    h = df.groupby([column])['cardio'].sum() / df.groupby([column])['cardio'].count()
    sns.jointplot(h.index, h.values, kind="reg")

# In[42]:

trend(d, 'ap_avg')
trend(d, 'ap_delta')

# In[50]:

trend(d, 'imt*ap_avg')

# In[54]:

trend(d, '4*ch+gl')
trend(d, 'w-h+100')

# In[57]:

trend(d, '(4*ch+gl)*ap_avg')

# In[44]:

def predict(clf, fname='submission_baseline_2.csv'):
    # Fit on the full cleaned train set and write positive-class probabilities
    # for the test set (no header, no index).
    train = prepare(pd.read_csv('data/train.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    test = prepare(pd.read_csv('data/test.csv.gz', compression='gzip', delimiter=';', index_col='id'))
    clf.fit(get_X(train), get_Y(train))
    pred = clf.predict_proba(test)
    result = pd.DataFrame(index=test.index)
    result['cardio'] = pred[:, 1]
    result.to_csv(fname, header=None, index=None)

# In[37]:

predict(XGBClassifier(seed=42, nthread=4, n_estimators=1000, max_depth=6))

# In[45]:

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of 50 XGBoost models with varied seeds and depths 3-5.
estimators = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=500, max_depth=(3 + (i % 3))))
              for i in range(50)]
clf = VotingClassifier(estimators, voting='soft')
predict(clf, 'submission_ensemble_1.csv')
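# The 50-model ensemble above goes straight to a submission file. A cheaper
# sanity check (added; a scaled-down sketch, not the original configuration)
# cross-validates a smaller version of the same ensemble on the prepared
# frame `d` before spending time on the full run:

# In[ ]:

small = [('xgb_{}'.format(i), XGBClassifier(seed=i * 42, nthread=4, n_estimators=100, max_depth=(3 + (i % 3))))
         for i in range(5)]
examine(VotingClassifier(small, voting='soft'), d)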