import pandas as pd
import numpy as np
from pandas.io.parsers import read_csv
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl
import scipy
import xgboost as xgb
# Restore the previously trained booster from disk; prediction is configured
# to use 4 threads.
bst = xgb.Booster(params={'nthread': 4})
bst.load_model("0001.model")

# Ids stored alongside the test matrix. They are used further down as the
# 'sid' and 'iid' columns, one entry per prediction row.
test_sid, test_iid = np.load("test.sid.npy"), np.load("test.iid.npy")

# Score every row of the serialized test matrix.
dtest = xgb.DMatrix("test.buffer")
y_pred_xgb_prob = bst.predict(dtest)
# Convert the predicted scores into hard 0/1 labels: a row is labelled 1 when
# its score reaches the cut-off, otherwise 0.
threshold = 0.047

# One vectorised comparison replaces the copy + two masked assignments. The
# cast keeps the dtype of the score array. NaN scores compare False and so
# become 0, instead of staying NaN and being counted as positives by
# np.count_nonzero below.
y_pred_xgb = (y_pred_xgb_prob >= threshold).astype(y_pred_xgb_prob.dtype)

# Report the number of positive rows and their share of all test rows.
# A single %-formatted string is valid on both Python 2 and Python 3 and
# prints the same text the Python 2 `print a, b` statement did.
n_positive = np.count_nonzero(y_pred_xgb)
print("%s %s" % (n_positive, 1.0 * n_positive / len(test_sid)))
2724529 0.330174261563
# One row per prediction: the id from test_sid, the 0/1 label, and the id
# from test_iid. `columns` pins the column order to sid, pred_xgb, iid.
df = pd.DataFrame(
    {'sid': test_sid, 'pred_xgb': y_pred_xgb, 'iid': test_iid},
    columns=['sid', 'pred_xgb', 'iid'],
)
# Notebook preview of the first rows.
df.head()
| | sid | pred_xgb | iid |
---|---|---|---|
0 | 5 | 1 | 214530776 |
1 | 5 | 1 | 214530776 |
2 | 5 | 1 | 214530776 |
3 | 10 | 0 | 214820942 |
4 | 10 | 0 | 214826810 |
def _join_unique_items(items):
    # Collapse one group's 'iid' values into a comma-separated string of the
    # distinct ids (order is whatever the set iteration yields).
    return ','.join(map(str, set(items.values)))


# Keep only the rows labelled positive, then build one id string per 'sid'.
positives = df[df['pred_xgb'] == 1]
guess_df = positives.groupby('sid')['iid'].apply(_join_unique_items)

# Write "sid;iid1,iid2,..." lines with no header and no index column.
# NOTE(review): the file name says 0.1 while the threshold used above is
# 0.047 -- confirm the name is intentional.
submission = guess_df.reset_index()
submission.to_csv("xgb_0.1.guess", sep=";", index=False, header=False)