from fastai import *
from fastai.tabular import *
Tabular data should be in a Pandas `DataFrame`.
# Resolve the ADULT_SAMPLE dataset location and read its CSV into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# Column roles: the target column, the categorical inputs and the continuous inputs.
dep_var = '>=50k'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']

# Preprocessing steps handed to the training TabularList below.
procs = [FillMissing, Categorify, Normalize]

# Rows 800-999 serve both as the validation split and (as a copy) as the test items.
valid_idx = list(range(800, 1000))
test = TabularList.from_df(
    df.iloc[800:1000].copy(),
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
)

# Build the DataBunch one step at a time: items -> split -> labels -> test set.
all_items = TabularList.from_df(
    df,
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
    procs=procs,
)
split_items = all_items.split_by_idx(valid_idx)
labelled_items = split_items.label_from_df(cols=dep_var)
with_test = labelled_items.add_test(test, label=0)
data = with_test.databunch()

# Display ten rows of a batch as a quick sanity check.
data.show_batch(rows=10)
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | target |
---|---|---|---|---|---|---|---|---|---|---|
Private | Prof-school | Married-civ-spouse | Prof-specialty | Husband | White | False | 0.1036 | 0.9224 | 1.9245 | 1 |
Self-emp-inc | Bachelors | Married-civ-spouse | Farming-fishing | Husband | White | False | 1.7161 | -1.2654 | 1.1422 | 1 |
Private | HS-grad | Never-married | Adm-clerical | Other-relative | Black | False | -0.7760 | 1.1905 | -0.4224 | 0 |
Private | 10th | Married-civ-spouse | Sales | Own-child | White | False | -1.5823 | -0.0268 | -1.5958 | 0 |
Private | Some-college | Never-married | Handlers-cleaners | Own-child | White | False | -1.3624 | 0.0284 | -0.0312 | 0 |
Private | Some-college | Married-civ-spouse | Prof-specialty | Husband | White | False | 0.3968 | 0.4367 | -0.0312 | 1 |
? | Some-college | Never-married | ? | Own-child | White | False | -1.4357 | -0.7295 | -0.0312 | 0 |
Self-emp-not-inc | 5th-6th | Married-civ-spouse | Sales | Husband | White | False | 0.6166 | -0.6503 | -2.7692 | 1 |
Private | Some-college | Married-civ-spouse | Sales | Husband | White | False | 1.5695 | -0.8876 | -0.0312 | 1 |
Local-gov | Some-college | Never-married | Handlers-cleaners | Own-child | White | False | -0.6294 | -1.5422 | -0.0312 | 0 |
# Create a learner for the tabular data with layer sizes 200 and 100,
# using accuracy as the metric.
layer_sizes = [200, 100]
learn = tabular_learner(data, layers=layer_sizes, metrics=accuracy)

# Bare expression: in a notebook this displays the model architecture.
learn.model
TabularModel( (embeds): ModuleList( (0): Embedding(10, 6) (1): Embedding(17, 9) (2): Embedding(8, 5) (3): Embedding(16, 9) (4): Embedding(7, 4) (5): Embedding(6, 4) (6): Embedding(3, 2) ) (emb_drop): Dropout(p=0.0) (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (layers): Sequential( (0): Linear(in_features=42, out_features=200, bias=True) (1): ReLU(inplace) (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (3): Linear(in_features=200, out_features=100, bias=True) (4): ReLU(inplace) (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (6): Linear(in_features=100, out_features=2, bias=True) ) )
# Fit the learner; the two positional arguments are 1 and 1e-2
# (the log below shows a single epoch being run).
first_arg, second_arg = 1, 1e-2
learn.fit(first_arg, second_arg)
Total time: 00:03

epoch | train_loss | valid_loss | accuracy | |
---|---|---|---|---|
1 | 0.362837 | 0.413169 | 0.785000 | (00:03) |
# Take the first row of the DataFrame (by position) as a single example.
row = df.iloc[0]
# Run the learner on that one row; the result is displayed as the cell output.
learn.predict(row)
(1, tensor(0), tensor([0.6365, 0.3635]))