# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Autoreload mode 2: re-import modules before each cell runs, so that edits to
# library code are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from fastai.tabular import *
# Working directory for this notebook; it is also handed to the TabularLists
# below as their `path`.
path = Path('data/tabular/')
path.ls()
[PosixPath('data/tabular/models'), PosixPath('data/tabular/train_bhu_ml.csv'), PosixPath('data/tabular/test_bhu_ml.csv'), PosixPath('data/tabular/Data_Test.xlsx'), PosixPath('data/tabular/ml2_test.csv'), PosixPath('data/tabular/Data_Train.xlsx'), PosixPath('data/tabular/ml2_train.csv')]
# Read the training table and preview its first rows.
train_csv = path / 'train_bhu_ml.csv'
df = pd.read_csv(train_csv)
df.head()
ID | Tag | Reputation | Answers | Username | Views | Upvotes | |
---|---|---|---|---|---|---|---|
0 | 52664 | a | 3942.0 | 2.0 | 155623 | 7855.0 | 42.0 |
1 | 327662 | a | 26046.0 | 12.0 | 21781 | 55801.0 | 1175.0 |
2 | 468453 | c | 1358.0 | 4.0 | 56177 | 8067.0 | 60.0 |
3 | 96996 | a | 264.0 | 3.0 | 168793 | 27064.0 | 9.0 |
4 | 131465 | c | 4271.0 | 4.0 | 112223 | 13986.0 | 83.0 |
# Summary statistics for every column; include='all' also covers the
# non-numeric Tag column (count / unique / top / freq).
df.describe(include='all')
ID | Tag | Reputation | Answers | Username | Views | Upvotes | |
---|---|---|---|---|---|---|---|
count | 330045.000000 | 330045 | 3.300450e+05 | 330045.000000 | 330045.000000 | 3.300450e+05 | 330045.000000 |
unique | NaN | 10 | NaN | NaN | NaN | NaN | NaN |
top | NaN | c | NaN | NaN | NaN | NaN | NaN |
freq | NaN | 72458 | NaN | NaN | NaN | NaN | NaN |
mean | 235748.682789 | NaN | 7.773147e+03 | 3.917672 | 81442.888803 | 2.964507e+04 | 337.505358 |
std | 136039.418471 | NaN | 2.706141e+04 | 3.579515 | 49215.100730 | 8.095646e+04 | 3592.441135 |
min | 1.000000 | NaN | 0.000000e+00 | 0.000000 | 0.000000 | 9.000000e+00 | 0.000000 |
25% | 117909.000000 | NaN | 2.820000e+02 | 2.000000 | 39808.000000 | 2.594000e+03 | 8.000000 |
50% | 235699.000000 | NaN | 1.236000e+03 | 3.000000 | 79010.000000 | 8.954000e+03 | 28.000000 |
75% | 353620.000000 | NaN | 5.118000e+03 | 5.000000 | 122559.000000 | 2.687000e+04 | 107.000000 |
max | 471493.000000 | NaN | 1.042428e+06 | 76.000000 | 175738.000000 | 5.231058e+06 | 615278.000000 |
# (rows, columns) of the training frame.
df.shape
(330045, 7)
# Remove the ID column from the frame in place, then preview the remaining
# columns.
df.drop(columns='ID', inplace=True)
df.head()
Tag | Reputation | Answers | Username | Views | Upvotes | |
---|---|---|---|---|---|---|
0 | a | 3942.0 | 2.0 | 155623 | 7855.0 | 42.0 |
1 | a | 26046.0 | 12.0 | 21781 | 55801.0 | 1175.0 |
2 | c | 1358.0 | 4.0 | 56177 | 8067.0 | 60.0 |
3 | a | 264.0 | 3.0 | 168793 | 27064.0 | 9.0 |
4 | c | 4271.0 | 4.0 | 112223 | 13986.0 | 83.0 |
# Partition the feature columns into continuous and categorical name lists,
# leaving the target column out of both.
# NOTE(review): with max_card=2 the numeric Username column ends up in the
# continuous list; it looks like an identifier rather than a quantity --
# confirm that treating it as continuous is intended.
cont, cat = cont_cat_split(df, max_card=2, dep_var='Upvotes')
cat
['Tag']
# Show the column names that were classified as continuous.
cont
['Reputation', 'Answers', 'Username', 'Views']
# Preprocessing steps passed to every TabularList built below.
procs = [FillMissing, Categorify, Normalize]
# Name of the regression target column.
dep = 'Upvotes'

# Positional index of the first validation row: rows [0, valid_start) are used
# for training and rows [valid_start, len(df)) for validation. Keeping the
# boundary in one place (and deriving the end from len(df)) keeps the
# validation split and the `test` slice in sync if the data changes.
valid_start = 300000

# NOTE(review): these "test" items are a copy of the validation rows taken from
# the training frame. The data directory also contains 'test_bhu_ml.csv';
# confirm whether that file should be loaded here instead.
test = TabularList.from_df(
    df.iloc[valid_start:].copy(),
    path=path,
    cat_names=cat,
    cont_names=cont,
    procs=procs,
)

data = (
    TabularList.from_df(df, path=path, cat_names=cat, cont_names=cont, procs=procs)
    .split_by_idx(list(range(valid_start, len(df))))
    .label_from_df(cols=dep)
    .add_test(test)
    .databunch()
)
data.show_batch()
Tag | Reputation | Answers | Username | Views | target |
---|---|---|---|---|---|
j | -0.2733 | -0.5361 | 0.3411 | -0.0308 | 47.0 |
r | 0.2996 | 1.1434 | 1.1281 | 2.6274 | 5664.0 |
c | -0.2709 | -0.5361 | -0.6921 | 0.0243 | 30.0 |
o | -0.2618 | -0.8160 | 0.3635 | -0.2002 | 10.0 |
c | 3.7315 | 0.8635 | 1.1977 | 0.3762 | 6626.0 |
# Nine hidden layers of 100 units each.
hidden_sizes = [100] * 9

# A single dropout value is supplied for the nine layers -- presumably fastai
# applies it to every layer; confirm against the tabular_learner docs.
learn = tabular_learner(
    data,
    layers=hidden_sizes,
    ps=[0.001],
    emb_drop=.04,
    metrics=root_mean_squared_error,
)
# Run the learning-rate range test before choosing a rate for training.
learn.lr_find()
epoch | train_loss | valid_loss | root_mean_squared_error | time |
---|
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
# Plot the recorded learning-rate finder curve and request a suggested
# learning rate on the plot.
learn.recorder.plot(suggestion=True)
Failed to compute the gradients, there might not be enough points.
# Train for four epochs with the one-cycle policy at a peak rate of 1.58e-4.
learn.fit_one_cycle(cyc_len=4, max_lr=1.58e-4)
epoch | train_loss | valid_loss | root_mean_squared_error | time |
---|---|---|---|---|
0 | 9108702.000000 | 10813853.000000 | 1460.150269 | 00:55 |
1 | 15765001.000000 | 10616855.000000 | 1426.873047 | 00:56 |
2 | 10329053.000000 | 10603879.000000 | 1423.520264 | 00:56 |
3 | 4453435.000000 | 10414276.000000 | 1395.455688 | 00:55 |
# Plot the losses recorded during the training run above.
learn.recorder.plot_losses()