from fastai.tabular import *
Tabular data should be in a Pandas DataFrame.
# Fetch the Adult sample dataset and read its CSV into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path / 'adult.csv')

# Column roles: the label column, then categorical and continuous features.
dep_var = '>=50k'
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

# Preprocessing steps passed to the training list below.
procs = [FillMissing, Categorify, Normalize]

# The test items are a copy of rows 800-999, i.e. the same rows whose
# indices are handed to split_by_idx further down.
test = TabularList.from_df(
    df.iloc[800:1000].copy(),
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
)

# Build the DataBunch one step at a time; each call returns the next stage.
data = TabularList.from_df(
    df,
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
    procs=procs,
)
data = data.split_by_idx(list(range(800, 1000)))
data = data.label_from_df(cols=dep_var)
# NOTE(review): label=0 looks like a placeholder label for the test items -- confirm.
data = data.add_test(test, label=0)
data = data.databunch()

# Display ten rows of a batch as a sanity check.
data.show_batch(rows=10)
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | target |
---|---|---|---|---|---|---|---|---|---|---|
Private | 7th-8th | Married-civ-spouse | Machine-op-inspct | Husband | White | False | -0.2629 | -0.9428 | -2.3781 | 1 |
Self-emp-inc | HS-grad | Married-civ-spouse | Transport-moving | Husband | White | False | 2.0093 | -1.0762 | -0.4224 | 1 |
Self-emp-not-inc | Some-college | Never-married | Craft-repair | Not-in-family | White | False | -0.3362 | -0.3120 | -0.0312 | 0 |
Local-gov | HS-grad | Never-married | Craft-repair | Own-child | White | False | 0.5434 | -0.8287 | -0.4224 | 0 |
Private | Masters | Never-married | Tech-support | Other-relative | White | False | -0.9226 | -1.5147 | 1.5334 | 0 |
Private | 10th | Widowed | Transport-moving | Not-in-family | Black | False | 1.2030 | -0.7890 | -1.5958 | 0 |
State-gov | Bachelors | Never-married | Prof-specialty | Not-in-family | White | False | -1.1425 | 2.9637 | 1.1422 | 0 |
Private | Assoc-acdm | Divorced | Craft-repair | Not-in-family | White | False | 0.8365 | 0.1033 | 0.7511 | 0 |
Private | Some-college | Separated | Sales | Unmarried | Black | False | -0.6294 | 0.2097 | -0.0312 | 0 |
Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | False | -0.7760 | 0.0061 | -0.4224 | 0 |
# Create a tabular learner over `data` with layer sizes 200 and 100,
# reporting accuracy as the metric.
learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
)
# Train for one epoch at a learning rate of 1e-2.
learn.fit(epochs=1, lr=1e-2)
epoch | train_loss | valid_loss | accuracy |
---|---|---|---|
1 | 0.361543 | 0.376106 | 0.815000 |
# Take the first row of the raw DataFrame as a single example to score.
row = df.iloc[0]
# Predict for that one row; left as a bare expression so that a notebook
# displays the returned value.
learn.predict(row)
(Category 1, tensor(1), tensor([0.2809, 0.7191]))