from fastai.text import *
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *
from torch.nn import functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
/home/dg/anaconda3/envs/fastai/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release. from numpy.core.umath_tests import inner1d
import gc
import pdb
def scale_vars(df, mapper, scale_col_exc):
warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
if mapper is None:
map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n]) and n not in scale_col_exc]
mapper = DataFrameMapper(map_f).fit(df)
df[mapper.transformed_names_] = mapper.transform(df)
return mapper
def proc_df2(df, y_fld = None, skip_flds=None, do_scale=True, scale_col_exc = None, na_dict=None,
preproc_fn=None, max_n_cat=10, subset=None, mapper=None):
if not skip_flds: skip_flds=[]
if subset: df = get_sample(df,subset)
df = df.copy()
if preproc_fn: preproc_fn(df)
if y_fld is not None:
y = df[y_fld].values
df.drop(skip_flds+[y_fld], axis=1, inplace=True)
if na_dict is None: na_dict = {}
for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
if do_scale: mapper = scale_vars(df, mapper, scale_col_exc)
for n,c in df.items(): numericalize(df, c, n, max_n_cat)
if y_fld is not None:
res = [pd.get_dummies(df, dummy_na=True), y, na_dict]
else:
res = [pd.get_dummies(df, dummy_na=True), na_dict]
if do_scale: res = res + [mapper]
return res
df_train = pd.read_csv('data/application_train.csv')
sk_id = set(df_train.SK_ID_CURR.values)
sk2tg = { k:v for k,v in df_train[['SK_ID_CURR', 'TARGET']].values}
cc_train = pd.read_csv('data/credit_card_balance.csv')
cc_train = cc_train[cc_train['SK_ID_CURR'].isin(sk_id)]
cc_train.drop(['SK_ID_PREV'], axis=1, inplace = True)
train_cats(cc_train)
cc_sk_id = set(cc_train['SK_ID_CURR'].values)
cc_y = [sk2tg[i] for i in cc_sk_id]
cc_sk_id = list(cc_sk_id)
x_train,x_valid, y_train, y_valid = train_test_split(cc_sk_id, cc_y, test_size=0.2, stratify = cc_y)
cc_train_proc, _, _ = proc_df2(cc_train, do_scale = True, scale_col_exc = ['SK_ID_CURR', 'MONTHS_BALANCE'])
def df_to_maxtrix(df, cols): return df[cols].values.astype(np.float32)
cc_train_proc = cc_train_proc.sort_values(by = ['SK_ID_CURR', 'MONTHS_BALANCE'])
cc_train = cc_train.sort_values(by = ['SK_ID_CURR', 'MONTHS_BALANCE'])
cols = [i for i in cc_train_proc.columns if i not in ['SK_ID_CURR','MONTHS_BALANCE']]
cc_train_group = cc_train_proc.groupby(['SK_ID_CURR']).apply(lambda x: df_to_maxtrix(x, cols))
cc_train.head(6)
SK_ID_CURR | MONTHS_BALANCE | AMT_BALANCE | AMT_CREDIT_LIMIT_ACTUAL | AMT_DRAWINGS_ATM_CURRENT | AMT_DRAWINGS_CURRENT | AMT_DRAWINGS_OTHER_CURRENT | AMT_DRAWINGS_POS_CURRENT | AMT_INST_MIN_REGULARITY | AMT_PAYMENT_CURRENT | ... | AMT_RECIVABLE | AMT_TOTAL_RECEIVABLE | CNT_DRAWINGS_ATM_CURRENT | CNT_DRAWINGS_CURRENT | CNT_DRAWINGS_OTHER_CURRENT | CNT_DRAWINGS_POS_CURRENT | CNT_INSTALMENT_MATURE_CUM | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1636141 | 100006 | -6 | 0.0 | 270000 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | 0.0 | 0.0 | NaN | 0 | NaN | NaN | 0.0 | Active | 0 | 0 |
655566 | 100006 | -5 | 0.0 | 270000 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | 0.0 | 0.0 | NaN | 0 | NaN | NaN | 0.0 | Active | 0 | 0 |
1399895 | 100006 | -4 | 0.0 | 270000 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | 0.0 | 0.0 | NaN | 0 | NaN | NaN | 0.0 | Active | 0 | 0 |
1347528 | 100006 | -3 | 0.0 | 270000 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | 0.0 | 0.0 | NaN | 0 | NaN | NaN | 0.0 | Active | 0 | 0 |
520387 | 100006 | -2 | 0.0 | 270000 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | 0.0 | 0.0 | NaN | 0 | NaN | NaN | 0.0 | Active | 0 | 0 |
584804 | 100006 | -1 | 0.0 | 270000 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | 0.0 | 0.0 | NaN | 0 | NaN | NaN | 0.0 | Active | 0 | 0 |
6 rows × 22 columns
class SortishSamplerRNN(Sampler):
def __init__(self, data_source, sk_id):
self.data_source, self.sk_id = data_source, sk_id
def __len__(self): return len(self.sk_id)
def __iter__(self):
d={}
for i in range(len(self.sk_id)): d.setdefault(len(self.data_source[self.sk_id[i]]), []).append(i)
result = [d[n] for n in sorted(d, reverse=True)]
result = [i for j in result for i in j]
return iter(result)
class ColumnarRNNDataset(Dataset):
def __init__(self, x, y):
self.x, self.y = x, np.array(y, dtype=np.float32)
def __len__(self): return len(self.y)
def __getitem__(self, idx): return [self.x[idx], self.y[idx]]
class ColumnarRNNModelData(ModelData):
@classmethod
def from_data_frames(cls, path, trn_x, val_x, trn_y, val_y,
tr_sampler = None, val_sampler = None,
bs=64, shuffle=False):
trn_ds = ColumnarRNNDataset(trn_x, trn_y)
val_ds = ColumnarRNNDataset(val_x, val_y)
return cls.get_dl(path, trn_ds, val_ds,
tr_sampler = tr_sampler, val_sampler = val_sampler,
bs = bs, shuffle=shuffle)
@classmethod
def get_dl(cls, path, trn_ds, val_ds,
tr_sampler = None, val_sampler = None,
bs = 64, shuffle=False):
trn_dl = DataLoader(trn_ds, bs, shuffle=shuffle, sampler = tr_sampler, num_workers=1)
val_dl = DataLoader(val_ds, bs, shuffle=shuffle, sampler = val_sampler, num_workers=1)
return cls(path, trn_dl, val_dl)
bs = 1024; PATH = ''
trn_samp = SortishSamplerRNN(cc_train_group, x_train)
val_samp = SortishSamplerRNN(cc_train_group, x_valid)
md = ColumnarRNNModelData.from_data_frames(PATH,
trn_x = x_train, val_x = x_valid,
trn_y = y_train, val_y = y_valid,
tr_sampler = trn_samp, val_sampler = val_samp,
bs=bs, shuffle=False)
class linear_block(nn.Module):
def __init__(self, ip_sz, op_sz, drop):
super().__init__()
self.op_sz = op_sz
self.ln = nn.Linear(ip_sz, op_sz)
kaiming_normal(self.ln.weight.data)
if drop is not None:
self.bn = nn.BatchNorm1d(op_sz)
self.drp = nn.Dropout(drop)
def forward(self, x): return self.drp(self.bn(F.relu(self.ln(x)))) if self.op_sz != 1 else self.ln(x)
class rnn_block(nn.Module):
def __init__(self, input_size = 36, hidden_size = 64, num_layers = 2, wdrop = 0.2, dropouto = 0.4):
super().__init__()
self.rnn = WeightDrop(nn.LSTM(input_size, hidden_size, num_layers, batch_first = True), wdrop)
self.dropouto = LockedDropout(dropouto)
def forward(self, cc_inp_pack):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
packed_output, _ = self.rnn(cc_inp_pack)
outp, _ = pad_packed_sequence(packed_output, batch_first=True)
outp = self.dropouto(outp)
return outp
class attention(nn.Module):
def __init__(self, dim, ip_sz = 64, op_sz = 36, drop = 0.2):
super().__init__()
self.w = self.rand_p(ip_sz, op_sz)
if dim == 1:
self.v = self.rand_p(op_sz)
else:
self.ln = linear_block(op_sz, op_sz, drop)
self.v = self.rand_p(op_sz, op_sz)
@staticmethod
def rand_p(*sz): return nn.Parameter(torch.randn(sz)/math.sqrt(sz[0]))
def forward(self, outp, mtrx = None):
we = outp@self.w
u = F.tanh(we)
at = F.softmax(u@self.v, 1)
xa = (at.unsqueeze(2)*outp).sum(1) if mtrx is None else self.ln((at*mtrx).sum(1))
return xa, at
class RNNModel(nn.Module):
def __init__(self, bs, cc_ref, cc_sk_id,
hd_sz, dropouti, wdrop, dropouto,
att_ip_drop, concat_drop,
ln_lyrs, ln_drop, ):
super().__init__()
self.bs, self.cc_sk_id, self.cc_ref, self.hd_sz = bs, cc_sk_id, cc_ref, hd_sz
self.dropouti = LockedDropout(dropouti)
self.rnn = rnn_block(hidden_size = hd_sz, wdrop = wdrop, dropouto = dropouto)
self.att_RNN = attention(dim = 1, ip_sz = self.hd_sz, op_sz = self.hd_sz)
self.att_ip = attention(dim = 2, ip_sz = self.hd_sz, drop = att_ip_drop)
self.concat_lyrs = linear_block(self.hd_sz*4+36, ln_lyrs[0], concat_drop)
self.lns = nn.ModuleList([linear_block(ln_lyrs[i], ln_lyrs[i+1], ln_drop[i]) for i in range(len(ln_lyrs)-1)])
def pad_from_inp(self, inp):
inp = [V(self.cc_ref[i]) for i in to_np(inp)]
lengths = [i.size()[0] for i in inp]
max_len = int(np.max(lengths))
pad_inp = torch.stack([nn.ConstantPad2d((0,0,0,max_len-i.size()[0]), 0)(i) for i in inp])
pad_inp_droped = self.dropouti(pad_inp)
return pack_padded_sequence(pad_inp_droped, lengths, batch_first=True), pad_inp
def pool(self, x, bs, is_max):
'''pooling layer, stole from fastai'''
f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
return f(x.permute(0,2,1), (1,)).view(bs,-1)
def forward(self, x_in):
#pdb.set_trace()
cc_inp, pad_inp = self.pad_from_inp(x_in)
outp = self.rnn(cc_inp)
att_rnn_out, at_rnn_m = self.att_RNN(outp)
att_inp_out, at_inp_out_m = self.att_ip(outp, pad_inp)
avgpool, mxpool = self.pool(outp, len(x_in), False), self.pool(outp, len(x_in), True)
x = self.concat_lyrs(torch.cat([outp[:,-1,:], avgpool, mxpool, att_rnn_out, att_inp_out], 1))
for lyer in self.lns: x = lyer(x)
return x.view(-1), at_rnn_m, at_inp_out_m
def bcewl(ip,tg): return F.binary_cross_entropy_with_logits(ip, tg, tg*0.6+0.2, size_average=False)
[dropouti, wdrop, dropouto] = np.array([0.25, 0.2, 0.15])*0.5
m = RNNModel(bs = bs, cc_ref = cc_train_group, cc_sk_id = cc_sk_id,
hd_sz = 128, dropouti = dropouti, wdrop = wdrop, dropouto = dropouto,
att_ip_drop = 0.1, concat_drop = 0.1,
ln_lyrs = [600, 400, 200, 100, 1], ln_drop = [0.1, 0.1, 0.1, 0.1])
RNN_learner = Learner.from_model_data(m, md)
RNN_learner.crit = bcewl
RNN_learner.lr_find()
HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))
57%|█████▋ | 39/68 [00:12<00:09, 3.08it/s, loss=384]
RNN_learner.sched.plot()
lr = 1e-4
RNN_learner.fit(lr, n_cycle = 3, cycle_len = 1)
HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))
epoch trn_loss val_loss 0 143.26762 145.529378 1 143.522918 145.716162 2 143.170848 145.176775
[array([145.17678])]
RNN_learner.save('RNN_learner')
it = iter(md.trn_dl)
x, y = next(it)
y_pred, a1, a2 = RNN_learner.predict_array(to_np(x))
df_pss = [i for i,j in enumerate(y) if j==1]
pos = 2
df_ps = df_pss[pos]
def plot_bar(df_ps, a1):
a1 = to_np(a1)[df_ps]
time_range = list(range(-len(a1)-1, -1))
plt.figure(figsize=(20,10))
plt.bar(time_range, a1)
def plot_heatmap(df_ps, a2):
plt.gcf().clear()
fig, ax = plt.subplots(figsize=(100,50))
a2 = to_np(a2)[df_ps].T
time_range = list(range(-len(a2.T)-1, -1))
ax = sns.heatmap(a2, xticklabels = time_range, yticklabels=cols, cmap="YlGnBu")
plt.show()
plot_bar(df_ps, a1)
plot_heatmap(df_ps, a2)
def plot_line(df_ps, col):
cc_train[cc_train['SK_ID_CURR'] == x[df_ps]][['MONTHS_BALANCE', col]].plot.line('MONTHS_BALANCE', col)
plot_line(df_ps, 'AMT_PAYMENT_CURRENT')
plot_line(df_ps, 'AMT_CREDIT_LIMIT_ACTUAL')
plot_line(df_ps, 'CNT_DRAWINGS_ATM_CURRENT')
pos = 60
df_ps = df_pss[pos]
plot_bar(df_ps, a1)
plot_heatmap(df_ps, a2)
plot_line(df_ps, 'AMT_INST_MIN_REGULARITY')
plot_line(df_ps, 'AMT_DRAWINGS_POS_CURRENT')
x, y = next(it)
y_pred, a1, a2 = RNN_learner.predict_array(to_np(x))
df_pss = [i for i,j in enumerate(y) if j==1]
pos = 30
df_ps = df_pss[pos]
plot_bar(df_ps, a1)
plot_heatmap(df_ps, a2)
plot_line(df_ps, 'AMT_INST_MIN_REGULARITY')
plot_line(df_ps, 'AMT_RECEIVABLE_PRINCIPAL')
plot_line(df_ps, 'AMT_RECIVABLE')