In [1]:
import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib as mpl

import scipy

import xgboost as xgb
In [2]:
# Build an empty booster configured for 4 threads, then restore the trained
# model from disk into it. The parameters are set before the model is loaded.
booster_params = {'nthread': 4}
bst = xgb.Booster(params=booster_params)
bst.load_model(fname="0001.model")
In [3]:
# Test-set feature matrix, read from the file "test.buffer"; it is the input
# passed to `bst.predict` further down.
dtest = xgb.DMatrix(data="test.buffer")
In [4]:
# Array stored next to the test matrix; it later becomes the `sid` column of
# the prediction frame.
# NOTE(review): presumably one entry per row of `dtest` -- confirm.
test_sid = np.load(file="test.sid.npy")
In [5]:
# Array stored next to the test matrix; it later becomes the `iid` column of
# the prediction frame.
# NOTE(review): presumably aligned row-for-row with `test_sid` -- confirm.
test_iid = np.load(file="test.iid.npy")
In [6]:
# Raw scores produced by the loaded booster for the test matrix.
y_pred_xgb_prob = bst.predict(dtest)

# Decision cut-off: a score at or above this value counts as a positive.
# NOTE(review): 0.047 looks hand-tuned -- confirm how it was chosen.
threshold = 0.047

# Binarise in a single vectorised comparison instead of two in-place mask
# assignments. The old two-step form read values the first step had already
# overwritten (only correct while threshold <= 1), and it left NaN scores as
# NaN, which `np.count_nonzero` counts as positive while the later `== 1`
# filter drops them. Here every score maps to exactly 0 or 1 (NaN -> 0), and
# the result keeps the dtype of the score array as before.
y_pred_xgb = (y_pred_xgb_prob >= threshold).astype(y_pred_xgb_prob.dtype)
In [7]:
# Report how many test rows were flagged positive and what fraction of all
# test rows that is. The count is computed once and reused.
n_positive = np.count_nonzero(y_pred_xgb)
positive_share = 1.0 * n_positive / len(test_sid)

# A single pre-formatted string in parentheses prints the same text under the
# Python 2 `print` statement and the Python 3 `print()` function; the bare
# `print a, b` form is a SyntaxError on Python 3.
print("%d %s" % (n_positive, positive_share))
2724529 0.330174261563
In [8]:
# Assemble one row per prediction: the `test_sid` values go in as the index,
# are moved into an ordinary column by `reset_index` (labelled 'index'), and
# the single data column (labelled 0) holds the binarised prediction.
df = (
    pd.DataFrame(y_pred_xgb, index=test_sid)
    .reset_index()
    .rename(columns={'index': 'sid', 0: 'pred_xgb'})
)

# Attach the `test_iid` values positionally as the third column.
df['iid'] = test_iid
In [9]:
# Peek at the first five rows to sanity-check the assembled frame.
df.head(n=5)
Out[9]:
sid pred_xgb iid
0 5 1 214530776
1 5 1 214530776
2 5 1 214530776
3 10 0 214820942
4 10 0 214826810
In [10]:
def _join_unique_items(item_ids):
    """Return the distinct values of one group as a comma-separated string.

    item_ids: the `iid` values of a single `sid` group (a pandas Series).

    The values are de-duplicated and then sorted, so the resulting string is
    identical from run to run. Joining a bare `set` would emit the ids in the
    interpreter's set-iteration order, which is implementation-defined.
    """
    return ','.join(str(item_id) for item_id in sorted(set(item_ids.values)))


# Keep only the rows predicted positive, then collapse each `sid` group into
# one comma-separated string of its distinct `iid` values.
predicted_positive = df[df['pred_xgb'] == 1]
guess_df = predicted_positive.groupby('sid')['iid'].apply(_join_unique_items)
In [11]:
# Turn the `sid` index back into a column so each output line carries both
# fields, then write them ';'-separated with no header row and no row index.
submission = guess_df.reset_index()
submission.to_csv("xgb_0.1.guess", header=False, index=False, sep=";")