from fastai import *
from fastai.tabular import *
from sklearn.model_selection import train_test_split
Tabular data should be in a Pandas DataFrame.
# Fetch the Adult sample dataset through fastai's untar_data and load its CSV into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# Name of the column the model is trained to predict.
dep_var = 'salary'
# Input columns handed to fastai as categorical and continuous variables respectively.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps passed as `procs` to every TabularList.from_df call below.
procs = [FillMissing, Categorify, Normalize]
# 80/20 split of the rows. No random_state is given, so which rows land in each
# part differs from run to run.
train, test = train_test_split(df, test_size=0.2)
print(len(train), len(test))
26048 6513
# Training DataBunch: a random 20% of the rows is held out as the validation set,
# and rows are labelled from the dep_var column.
data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_by_rand_pct(0.2)
.label_from_df(cols=dep_var)
.databunch())
# Labelled LabelLists with no validation split; used only as a source for an
# evaluation dataloader.
# NOTE(review): this is built from the full `df` rather than the `test` frame split
# off above, and without `processor=data.processor` -- confirm that is intended.
data_test = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_none()
.label_from_df(cols=dep_var))
# Point `valid` at the same LabelList as `train` before databunching, so these rows
# are served by the validation dataloader.
data_test.valid = data_test.train
data_test = data_test.databunch()
# Replace the validation dataloader of `data` with the one built from data_test.
data.valid_dl = data_test.valid_dl
First, I will show an example of what will not work:
# Unlabelled test items taken from row positions 700-999 of df; no procs are passed here.
test = TabularList.from_df(df.iloc[700:1000].copy(), path=path, cat_names=cat_names, cont_names=cont_names)
# Validation set is picked by index (800-999) and `test` is attached with add_test.
# Note that the test rows (700-999) overlap both the validation rows and the training rows.
data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_by_idx(list(range(800,1000)))
.label_from_df(cols=dep_var)
.add_test(test)
.databunch())
# Tabular model with layers=[200,100], reporting accuracy; fit for 1 epoch at lr 1e-2.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.366413 | 0.387860 | 0.830000 | 00:05 |
data
TabularDataBunch; Train: LabelList (32361 items) x: TabularList workclass Private; education Assoc-acdm; marital-status Married-civ-spouse; occupation #na#; relationship Wife; race White; education-num_na False; age 0.7632; fnlwgt -0.8381; education-num 0.7511; ,workclass Private; education Masters; marital-status Divorced; occupation Exec-managerial; relationship Not-in-family; race White; education-num_na False; age 0.3968; fnlwgt 0.4458; education-num 1.5334; ,workclass Private; education HS-grad; marital-status Divorced; occupation #na#; relationship Unmarried; race Black; education-num_na True; age -0.0430; fnlwgt -0.8868; education-num -0.0312; ,workclass Self-emp-inc; education Prof-school; marital-status Married-civ-spouse; occupation Prof-specialty; relationship Husband; race Asian-Pac-Islander; education-num_na False; age -0.0430; fnlwgt -0.7288; education-num 1.9245; ,workclass Self-emp-not-inc; education 7th-8th; marital-status Married-civ-spouse; occupation Other-service; relationship Wife; race Black; education-num_na True; age 0.2502; fnlwgt -1.0185; education-num -0.0312; y: CategoryList >=50k,>=50k,<50k,>=50k,<50k Path: /root/.fastai/data/adult_sample; Valid: LabelList (200 items) x: TabularList workclass Private; education Some-college; marital-status Divorced; occupation Handlers-cleaners; relationship Unmarried; race White; education-num_na True; age 0.4701; fnlwgt -0.8793; education-num -0.0312; ,workclass Self-emp-inc; education Prof-school; marital-status Married-civ-spouse; occupation Prof-specialty; relationship Husband; race White; education-num_na True; age 0.5434; fnlwgt 0.0290; education-num -0.0312; ,workclass Private; education Assoc-voc; marital-status Divorced; occupation #na#; relationship Not-in-family; race White; education-num_na True; age -0.1896; fnlwgt 1.7704; education-num -0.0312; ,workclass Federal-gov; education Bachelors; marital-status Never-married; occupation Tech-support; relationship Not-in-family; race White; 
education-num_na True; age -0.9959; fnlwgt -1.3242; education-num -0.0312; ,workclass Private; education Bachelors; marital-status Married-civ-spouse; occupation #na#; relationship Husband; race White; education-num_na True; age -0.1163; fnlwgt -0.2389; education-num -0.0312; y: CategoryList <50k,>=50k,<50k,<50k,<50k Path: /root/.fastai/data/adult_sample; Test: LabelList (300 items) x: TabularList workclass Private; education HS-grad; marital-status Never-married; occupation #na#; relationship Own-child; race White; education-num_na True; age -0.6294; fnlwgt -1.2432; education-num -0.0312; ,workclass Federal-gov; education HS-grad; marital-status Married-civ-spouse; occupation Farming-fishing; relationship Husband; race White; education-num_na False; age 0.3235; fnlwgt 0.0586; education-num -0.4224; ,workclass Private; education Bachelors; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; education-num_na False; age -0.1896; fnlwgt -1.4639; education-num 1.1422; ,workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation #na#; relationship Husband; race White; education-num_na False; age -0.1163; fnlwgt -0.2014; education-num -0.4224; ,workclass Self-emp-inc; education HS-grad; marital-status Never-married; occupation #na#; relationship Not-in-family; race White; education-num_na True; age -0.2629; fnlwgt -0.4633; education-num -0.0312; y: EmptyLabelList ,,,, Path: /root/.fastai/data/adult_sample
# Loss and accuracy computed over the training dataloader.
learn.validate(dl=learn.data.train_dl)
[0.35220727, tensor(0.8369)]
# No dl argument: the result matches the valid_loss / accuracy row printed by fit above.
learn.validate()
[0.3878597, tensor(0.8300)]
# Same call on the test dataloader. The databunch printed above shows the test set
# with `y: EmptyLabelList`, i.e. it carries no real labels to score against.
learn.validate(dl = learn.data.test_dl)
[0.2829722, tensor(0.8967)]
This looks very good, right? But this accuracy is higher than research-level results, so let's evaluate a different way to be sure.
I'm going to first use train_test_split to split our data into a 90/10 split
# 90/10 split of the full DataFrame. This rebinds `test`, which previously held a
# TabularList, to the held-out DataFrame.
train, test = train_test_split(df, test_size=0.1)
len(train), len(test)
(29304, 3257)
Great, we have a 10% split. Now let's make our train and test databunches.
# Training DataBunch built only from the `train` frame.
data = (TabularList.from_df(train, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_by_rand_pct(0.2) # hold out a random 20% of `train` as the validation set
.label_from_df(cols=dep_var)
# add_test(test) is deliberately not used here; the labelled test set is built separately below
.databunch())
# Labelled LabelLists for the held-out `test` frame (not a DataBunch yet).
data_test = (TabularList.from_df(test, path=path, cat_names=cat_names,
cont_names=cont_names, procs=procs,
processor = data.processor) # reuse the processor of `data` so the procs are applied exactly as they were to the training data
.split_none() # no validation split: every row ends up in `train`
.label_from_df(cols=dep_var)
)
Here we do not databunch yet. This is because the training dataloader is shuffled, which we don't want, and its last batch is dropped if it is incomplete. How do we fix this? Set the valid set to the train set *before* databunching.
data_test
LabelLists; Train: LabelList (3257 items) x: TabularList workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation Transport-moving; relationship Husband; race White; education-num_na False; age -0.4820; fnlwgt 2.1986; education-num -0.4231; ,workclass Private; education Doctorate; marital-status Married-civ-spouse; occupation Prof-specialty; relationship Husband; race White; education-num_na False; age 0.3985; fnlwgt 0.6453; education-num 2.3199; ,workclass Self-emp-not-inc; education Prof-school; marital-status Married-civ-spouse; occupation #na#; relationship Husband; race White; education-num_na False; age 0.3985; fnlwgt 0.0722; education-num 1.9281; ,workclass Private; education Bachelors; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; education-num_na False; age 0.8387; fnlwgt -0.5257; education-num 1.1443; ,workclass Private; education 11th; marital-status Divorced; occupation Sales; relationship Own-child; race White; education-num_na False; age -0.9956; fnlwgt -1.1058; education-num -1.2069; y: CategoryList <50k,>=50k,>=50k,>=50k,<50k Path: /root/.fastai/data/adult_sample; Valid: LabelList (0 items) x: TabularList y: CategoryList Path: /root/.fastai/data/adult_sample; Test: None
# Expose the test rows as the validation set before databunching, so they are served
# by the validation dataloader rather than the training one.
data_test.valid = data_test.train
data_test = data_test.databunch()
Okay now let's look at the two
data
TabularDataBunch; Train: LabelList (23444 items) x: TabularList workclass Local-gov; education Some-college; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; education-num_na False; age -0.7755; fnlwgt 0.8878; education-num -0.0313; ,workclass Private; education HS-grad; marital-status Never-married; occupation Craft-repair; relationship Not-in-family; race White; education-num_na False; age 0.1050; fnlwgt 0.1760; education-num -0.4231; ,workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation Protective-serv; relationship Husband; race White; education-num_na False; age -0.6288; fnlwgt 0.1266; education-num -0.4231; ,workclass Local-gov; education Bachelors; marital-status Never-married; occupation Prof-specialty; relationship Not-in-family; race White; education-num_na False; age -0.9223; fnlwgt -0.2255; education-num 1.1443; ,workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; education-num_na False; age 1.2056; fnlwgt -0.7925; education-num -0.4231; y: CategoryList <50k,<50k,>=50k,<50k,>=50k Path: /root/.fastai/data/adult_sample; Valid: LabelList (5860 items) x: TabularList workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation Machine-op-inspct; relationship Wife; race White; education-num_na False; age -0.1885; fnlwgt -0.0029; education-num -0.4231; ,workclass State-gov; education Masters; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; education-num_na False; age 1.0589; fnlwgt 0.0686; education-num 1.5362; ,workclass Private; education 9th; marital-status Never-married; occupation Other-service; relationship Own-child; race White; education-num_na False; age -0.8489; fnlwgt -1.4557; education-num -1.9906; ,workclass Private; education HS-grad; marital-status Divorced; occupation Other-service; relationship Not-in-family; race 
White; education-num_na False; age -0.7755; fnlwgt 0.0106; education-num -0.4231; ,workclass Private; education HS-grad; marital-status Never-married; occupation Transport-moving; relationship Unmarried; race White; education-num_na False; age -0.7755; fnlwgt 0.7598; education-num -0.4231; y: CategoryList <50k,>=50k,<50k,<50k,<50k Path: /root/.fastai/data/adult_sample; Test: None
data_test
TabularDataBunch; Train: LabelList (3257 items) x: TabularList workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation Transport-moving; relationship Husband; race White; education-num_na False; age -0.4820; fnlwgt 2.1986; education-num -0.4231; ,workclass Private; education Doctorate; marital-status Married-civ-spouse; occupation Prof-specialty; relationship Husband; race White; education-num_na False; age 0.3985; fnlwgt 0.6453; education-num 2.3199; ,workclass Self-emp-not-inc; education Prof-school; marital-status Married-civ-spouse; occupation #na#; relationship Husband; race White; education-num_na False; age 0.3985; fnlwgt 0.0722; education-num 1.9281; ,workclass Private; education Bachelors; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; education-num_na False; age 0.8387; fnlwgt -0.5257; education-num 1.1443; ,workclass Private; education 11th; marital-status Divorced; occupation Sales; relationship Own-child; race White; education-num_na False; age -0.9956; fnlwgt -1.1058; education-num -1.2069; y: CategoryList <50k,>=50k,>=50k,>=50k,<50k Path: /root/.fastai/data/adult_sample; Valid: LabelList (3257 items) x: TabularList workclass Private; education HS-grad; marital-status Married-civ-spouse; occupation Transport-moving; relationship Husband; race White; education-num_na False; age -0.4820; fnlwgt 2.1986; education-num -0.4231; ,workclass Private; education Doctorate; marital-status Married-civ-spouse; occupation Prof-specialty; relationship Husband; race White; education-num_na False; age 0.3985; fnlwgt 0.6453; education-num 2.3199; ,workclass Self-emp-not-inc; education Prof-school; marital-status Married-civ-spouse; occupation #na#; relationship Husband; race White; education-num_na False; age 0.3985; fnlwgt 0.0722; education-num 1.9281; ,workclass Private; education Bachelors; marital-status Married-civ-spouse; occupation Exec-managerial; relationship Husband; race White; 
education-num_na False; age 0.8387; fnlwgt -0.5257; education-num 1.1443; ,workclass Private; education 11th; marital-status Divorced; occupation Sales; relationship Own-child; race White; education-num_na False; age -0.9956; fnlwgt -1.1058; education-num -1.2069; y: CategoryList <50k,>=50k,>=50k,>=50k,<50k Path: /root/.fastai/data/adult_sample; Test: None
The numbers look right; let's do a quick training run and try switching them again.
# Fresh learner on the DataBunch built from `train` only; fit for 1 epoch at lr 1e-2.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.368615 | 0.359652 | 0.832082 | 00:05 |
# Swap the held-out test dataloader in as the learner's validation dataloader, then
# time a validate() call over it (IPython %time magic).
learn.data.valid_dl = data_test.valid_dl
%time learn.validate()
CPU times: user 238 ms, sys: 100 ms, total: 338 ms Wall time: 486 ms
[0.35564667, tensor(0.8354)]
learn.data
TabularDataBunch; Train: LabelList (23444 items) x: TabularList workclass ?; education HS-grad; marital-status Never-married; occupation ?; relationship Own-child; race White; education-num_na False; age -1.3688; fnlwgt 1.3052; education-num -0.4249; ,workclass Local-gov; education 7th-8th; marital-status Married-civ-spouse; occupation Craft-repair; relationship Husband; race White; education-num_na False; age 1.4246; fnlwgt 0.7850; education-num -2.3731; ,workclass Private; education Some-college; marital-status Never-married; occupation Sales; relationship Own-child; race White; education-num_na False; age -1.2218; fnlwgt 1.8899; education-num -0.0352; ,workclass Private; education Assoc-acdm; marital-status Married-civ-spouse; occupation Adm-clerical; relationship Husband; race White; education-num_na False; age -0.0456; fnlwgt -0.0499; education-num 0.7441; ,workclass Private; education HS-grad; marital-status Divorced; occupation Handlers-cleaners; relationship Not-in-family; race White; education-num_na False; age 0.0279; fnlwgt -0.6546; education-num -0.4249; y: CategoryList <50k,<50k,<50k,<50k,<50k Path: /root/.fastai/data/adult_sample; Valid: LabelList (3257 items) x: TabularList workclass Local-gov; education HS-grad; marital-status Married-civ-spouse; occupation Transport-moving; relationship Husband; race Black; education-num_na False; age 0.9100; fnlwgt 1.2334; education-num -0.4249; ,workclass Private; education HS-grad; marital-status Never-married; occupation Exec-managerial; relationship Own-child; race White; education-num_na False; age -1.1483; fnlwgt 0.2891; education-num -0.4249; ,workclass State-gov; education HS-grad; marital-status Married-civ-spouse; occupation Adm-clerical; relationship Wife; race White; education-num_na False; age 0.6159; fnlwgt -0.8373; education-num -0.4249; ,workclass Federal-gov; education HS-grad; marital-status Divorced; occupation Adm-clerical; relationship Not-in-family; race White; education-num_na False; age 
0.6895; fnlwgt 0.5501; education-num -0.4249; ,workclass Local-gov; education HS-grad; marital-status Never-married; occupation Protective-serv; relationship Own-child; race Black; education-num_na False; age -0.6337; fnlwgt 0.0258; education-num -0.4249; y: CategoryList <50k,<50k,>=50k,<50k,<50k Path: /root/.fastai/data/adult_sample; Test: None
As we can see, we no longer get that SUPER high test-set accuracy; the earlier number wasn't really validating against the test labels for us! We can also confirm that the Valid LabelList was replaced with our own, since our test set has 3257 items. This is also much faster than calling learn.predict() row by row. I'll show a timing example below.
Below is a quick function using learn.predict that checks, row by row, whether each prediction matches the actual label in the DataFrame we pass in.
def CalculateAccuracy(learner, df, right=0, dep_var='salary'):
    """Return the fraction of rows in `df` whose predicted label matches the actual one.

    Each row is passed to ``learner.predict`` one at a time, and element ``[0]``
    of the result is compared, as a string, with that row's value in `dep_var`.

    Args:
        learner: object exposing ``predict(row)``; element ``[0]`` of its return
            value is taken as the predicted label.
        df: pandas DataFrame holding the input columns and the `dep_var` column.
        right: starting value of the match counter. Existing callers pass 0; the
            parameter is kept (now with a default) for backward compatibility.
        dep_var: name of the label column. Defaults to ``'salary'``, the column
            the original implementation hard-coded.

    Returns:
        ``(right + number_of_matches) / len(df)``, or ``0.0`` when `df` has no rows.
    """
    total = len(df)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    labels = df[dep_var]
    for pos in range(total):
        # Compare string forms because the prediction object and the raw label
        # may be of different types.
        if str(labels.iloc[pos]) == str(learner.predict(df.iloc[pos])[0]):
            right += 1
    return right / total
%time acc = CalculateAccuracy(learn, test, 0)
CPU times: user 1min 22s, sys: 200 ms, total: 1min 22s Wall time: 1min 22s
acc
0.8326680994780473
Now let's use fastai's get_preds function and do a comparison after we switch the validation dataloader as above.
data_test = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_none()
.label_from_df(cols=dep_var))
data_test.valid = data_test.train
data_test=data_test.databunch()
learn.data.valid_dl = data_test.valid_dl
%time learn.get_preds(ds_type=DatasetType.Valid)
CPU times: user 1.85 s, sys: 437 ms, total: 2.29 s Wall time: 3.45 s
[tensor([[0.4628, 0.5372], [0.4795, 0.5205], [0.9468, 0.0532], ..., [0.5167, 0.4833], [0.7234, 0.2766], [0.8070, 0.1930]]), tensor([1, 1, 0, ..., 1, 0, 0])]
Look at that time difference! 1:22 vs. about 3 s. That is much faster, as we are using the GPU here too.
# Keep handles on both dataloaders so they can be swapped in and out of the learner.
# NOTE(review): `learn` was created from `data`, and learn.data.valid_dl has already
# been reassigned earlier in this file. If learn.data is the same object as `data`,
# `valid_dl` here is the swapped-in test dataloader, not the original validation one.
# Capture data.valid_dl before the first swap -- confirm.
valid_dl = data.valid_dl
test_dl = data_test.valid_dl
Now we can safely just replace one or the other and keep going.
To predict on test:
learn.data.valid_dl = test_dl
To revert back to our validation
learn.data.valid_dl = valid_dl
And now we can flip back and forth!