Tabular models

In [ ]:
from fastai.tabular import *

Tabular data should be in a Pandas DataFrame.

In [ ]:
# Locate the Adult sample dataset. `untar_data` is expected to download and
# extract the archive on first use and return its local folder — see the
# fastai docs to confirm the caching behaviour.
path = untar_data(URLs.ADULT_SAMPLE)
# Read the CSV that ships in that folder into a pandas DataFrame.
df = pd.read_csv(path/'adult.csv')
In [ ]:
# Column used as the label when the data is assembled.
dep_var = 'salary'

# Columns handed to TabularList as categorical variables.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]

# Columns handed to TabularList as continuous variables.
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

# Preprocessing steps passed as `procs` when the training data is built,
# listed in the order given here.
procs = [
    FillMissing,
    Categorify,
    Normalize,
]
In [ ]:
# Unlabeled test items: a copy of rows 800-999 of `df` (the same index range
# that is passed to `split_by_idx` when `data` is built).
# No `procs` are given here; presumably the processors fitted on the training
# data are applied once this list is attached with `add_test` — TODO confirm
# against the fastai data block documentation.
test = TabularList.from_df(
    df.iloc[800:1000].copy(),
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
)
In [ ]:
# Assemble the DataBunch one step at a time.
data = (
    # start from the full DataFrame with the column roles and preprocessing
    TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
    # rows 800-999 form the validation set; all other rows are used for training
    .split_by_idx(list(range(800, 1000)))
    # labels are read from the `dep_var` column
    .label_from_df(cols=dep_var)
    # attach the unlabeled test items
    .add_test(test)
    # finalize into a DataBunch
    .databunch()
)
In [ ]:
# Sanity check: display 10 rows of a batch after preprocessing, to confirm the
# columns and labels look as expected before training.
data.show_batch(rows=10)
workclass education marital-status occupation relationship race education-num_na age fnlwgt education-num target
Private HS-grad Never-married Sales Not-in-family White False -1.2158 1.1004 -0.4224 <50k
? HS-grad Widowed ? Not-in-family White False 1.8627 0.0976 -0.4224 <50k
Self-emp-not-inc HS-grad Never-married Craft-repair Own-child Black False 0.0303 0.2092 -0.4224 <50k
Private HS-grad Married-civ-spouse Protective-serv Husband White False 1.5695 -0.5938 -0.4224 <50k
Private HS-grad Married-civ-spouse Handlers-cleaners Husband White False -0.9959 -0.0318 -0.4224 <50k
Private 10th Married-civ-spouse Farming-fishing Wife White False -0.7027 0.6071 -1.5958 <50k
Private HS-grad Married-civ-spouse Machine-op-inspct Husband White False 0.1036 -0.0968 -0.4224 <50k
Private Some-college Married-civ-spouse Exec-managerial Own-child White False -0.7760 -0.6653 -0.0312 >=50k
State-gov Some-college Never-married Tech-support Own-child White False -0.8493 -1.4959 -0.0312 <50k
Private 11th Never-married Machine-op-inspct Not-in-family White False -1.0692 -0.9516 -1.2046 <50k
In [ ]:
# Build the learner for the tabular DataBunch.
# `layers=[200, 100]` presumably sets the sizes of two hidden layers — confirm
# in the fastai `tabular_learner` docs. Accuracy is reported as the metric.
learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
)
In [ ]:
# Train for a single epoch with a learning rate of 1e-2.
learn.fit(epochs=1, lr=1e-2)
Total time: 00:03

epoch train_loss valid_loss accuracy
1 0.354604 0.378520 0.820000

Inference

In [ ]:
# Take the first row of the original DataFrame as a single example to predict on.
# NOTE(review): this row is outside the 800-999 validation range, so it is one
# the model was trained on; it also still carries the `salary` column.
row = df.iloc[0]
In [ ]:
# Predict for the single row. The displayed result is a 3-tuple: the predicted
# category, its class index as a tensor, and a tensor of per-class probabilities.
learn.predict(row)
Out[ ]:
(Category >=50k, tensor(1), tensor([0.4402, 0.5598]))
In [ ]: