#hide
#skip
! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on colab
#default_exp tabular.data
#export
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.tabular.core import *
#hide
from nbdev.showdoc import *
Helper functions to get data in a `DataLoaders` in the tabular application, and the higher-level class `TabularDataLoaders`
The main class to get your data ready for model training is `TabularDataLoaders` and its factory methods. Check out the tabular tutorial for examples of use.
#export
class TabularDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for tabular data"
    @classmethod
    @delegates(Tabular.dataloaders, but=["dl_type", "dl_kwargs"])
    def from_df(cls, df, # DataFrame holding the features and the target column(s)
                path='.', # Forwarded as `path` to `to.dataloaders`
                procs=None, # Processing steps handed to `TabularPandas`
                cat_names=None, # Names of the categorical columns (defaults to none)
                cont_names=None, # Names of the continuous columns (defaults to every remaining column)
                y_names=None, # Name(s) of the dependent variable(s)
                y_block=None, # Block to use for the target, handed to `TabularPandas`
                valid_idx=None, # Row indices of the validation set (random split when `None`)
                **kwargs): # Extra keyword arguments forwarded to `to.dataloaders`
        "Create from `df` in `path` using `procs`"
        if cat_names is None: cat_names = []
        if cont_names is None:
            # Every column that is neither categorical nor a target is treated as continuous.
            # The columns are walked in frame order instead of going through set arithmetic:
            # iterating a set of strings gives an order that can change from one interpreter
            # run to the next, which made the inferred column order non-reproducible.
            # `dict.fromkeys` keeps the de-duplication the set-based version provided.
            excluded = set(L(cat_names)) | set(L(y_names))
            cont_names = list(dict.fromkeys(c for c in df if c not in excluded))
        # Random split unless the caller pinned the validation rows explicitly
        splits = RandomSplitter()(df) if valid_idx is None else IndexSplitter(valid_idx)(df)
        to = TabularPandas(df, procs, cat_names, cont_names, y_names, splits=splits, y_block=y_block)
        return to.dataloaders(path=path, **kwargs)

    @classmethod
    def from_csv(cls, csv, # File path or buffer accepted by `pd.read_csv`
                 skipinitialspace=True, # Passed to `pd.read_csv`: skip spaces following a delimiter
                 **kwargs): # Forwarded unchanged to `from_df`
        "Create from `csv` file in `path` using `procs`"
        return cls.from_df(pd.read_csv(csv, skipinitialspace=skipinitialspace), **kwargs)

    @delegates(TabDataLoader.__init__)
    def test_dl(self, test_items, # Raw items to build the test set from
                rm_type_tfms=None, # Accepted for signature compatibility; not used in this method
                process=True, # Call `process()` on the new tabular object before wrapping it
                inplace=False, # Forwarded to `self.train_ds.new`
                **kwargs): # Forwarded to `self.valid.new`
        "Create a test dataloader from `test_items`, modelled on the validation dataloader"
        # Build the new tabular object from the training set so it shares its configuration
        to = self.train_ds.new(test_items, inplace=inplace)
        if process: to.process()
        return self.valid.new(to, **kwargs)
# Point `Tabular._dbunch_type` at the class defined above.
# NOTE(review): presumably this is what makes `Tabular.dataloaders` build a `TabularDataLoaders` - confirm in fastai.tabular.core
Tabular._dbunch_type = TabularDataLoaders
# Re-wrap `from_csv` with `delegates` so that its signature lists the keyword arguments of
# `from_df` that it forwards through `**kwargs`. This has to happen after the class body,
# because it refers to the finished `TabularDataLoaders.from_df`.
TabularDataLoaders.from_csv = delegates(to=TabularDataLoaders.from_df)(TabularDataLoaders.from_csv)
This class should not be used directly, one of the factory methods should be preferred instead. All those factory methods accept as arguments:

- `cat_names`: the names of the categorical variables
- `cont_names`: the names of the continuous variables
- `y_names`: the names of the dependent variables
- `y_block`: the `TransformBlock` to use for the target
- `valid_idx`: the indices to use for the validation set (defaults to a random split otherwise)
- `bs`: the batch size
- `val_bs`: the batch size for the validation `DataLoader` (defaults to `bs`)
- `shuffle_train`: if we shuffle the training `DataLoader` or not
- `n`: overrides the numbers of elements in the dataset
- `device`: the PyTorch device to use (defaults to `default_device()`)

show_doc(TabularDataLoaders.from_df)
TabularDataLoaders.from_df
[source]
TabularDataLoaders.from_df
(df
,path
='.'
,procs
=None
,cat_names
=None
,cont_names
=None
,y_names
=None
,y_block
=None
,valid_idx
=None
,bs
=64
,val_bs
=None
,shuffle_train
=True
,n
=None
,device
=None
)
Create from df
in path
using procs
Let's have a look at an example with the adult dataset:
# Fetch the ADULT_SAMPLE dataset with `untar_data` and keep the returned location
path = untar_data(URLs.ADULT_SAMPLE)
# `skipinitialspace=True` drops the spaces that follow a comma in adult.csv
df = pd.read_csv(path/'adult.csv', skipinitialspace=True)
# Last expression of the cell: displays the first rows of the frame
df.head()
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
# Columns to treat as categorical and as continuous in this example
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Processing steps passed to `from_df` as `procs`
procs = [Categorify, FillMissing, Normalize]
# Rows 800-999 are used as the validation set; "salary" is the dependent variable
dls = TabularDataLoaders.from_df(df, path, procs=procs, cat_names=cat_names, cont_names=cont_names,
                                 y_names="salary", valid_idx=list(range(800,1000)), bs=64)
# Display a batch as a table
dls.show_batch()
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Private | 11th | Separated | Adm-clerical | Unmarried | Black | False | 55.0 | 213894.000562 | 7.0 | <50k |
1 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | False | 53.0 | 228500.001385 | 9.0 | >=50k |
2 | Private | HS-grad | Married-civ-spouse | Tech-support | Husband | White | False | 38.0 | 256864.000909 | 9.0 | >=50k |
3 | Private | Bachelors | Married-civ-spouse | Tech-support | Husband | White | False | 40.0 | 247879.997190 | 13.0 | >=50k |
4 | Private | Some-college | Divorced | Craft-repair | Not-in-family | White | False | 41.0 | 40151.001925 | 10.0 | >=50k |
5 | Private | HS-grad | Married-civ-spouse | Sales | Husband | White | False | 37.0 | 110713.001599 | 9.0 | >=50k |
6 | Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 38.0 | 278924.000902 | 13.0 | >=50k |
7 | Self-emp-not-inc | 11th | Married-civ-spouse | Farming-fishing | Husband | White | False | 60.0 | 220341.999356 | 7.0 | <50k |
8 | ? | 9th | Never-married | ? | Not-in-family | White | False | 30.0 | 104965.001013 | 5.0 | <50k |
9 | ? | HS-grad | Never-married | ? | Not-in-family | White | False | 21.0 | 105311.997415 | 9.0 | <50k |
show_doc(TabularDataLoaders.from_csv)
TabularDataLoaders.from_csv
[source]
TabularDataLoaders.from_csv
(csv
,skipinitialspace
=True
,path
='.'
,procs
=None
,cat_names
=None
,cont_names
=None
,y_names
=None
,y_block
=None
,valid_idx
=None
,bs
=64
,val_bs
=None
,shuffle_train
=True
,n
=None
,device
=None
)
Create from csv
file in path
using procs
# Same column setup as the `from_df` example, this time reading the csv file directly
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
# `from_csv` reads the file with `skipinitialspace=True` by default, then defers to `from_df`
dls = TabularDataLoaders.from_csv(path/'adult.csv', path=path, procs=procs, cat_names=cat_names, cont_names=cont_names,
                                  y_names="salary", valid_idx=list(range(800,1000)), bs=64)
External structured data files can contain unexpected spaces, e.g. after a comma. We can see that in the first row of adult.csv: `"49, Private,101320, ..."`. Often trimming is needed. Pandas has a convenient parameter `skipinitialspace` that is exposed by `TabularDataLoaders.from_csv()`. Otherwise, category labels used for inference later, such as `workclass`:`Private`, will be wrongly categorized to 0 or `"#na#"` if the training label was read as `" Private"`. Let's test this feature.
# A single inference row whose 'workclass' label has no leading space
test_data = {
    'age': [49],
    'workclass': ['Private'],
    'fnlwgt': [101320],
    'education': ['Assoc-acdm'],
    'education-num': [12.0],
    'marital-status': ['Married-civ-spouse'],
    'occupation': [''],
    'relationship': ['Wife'],
    'race': ['White'],
}
# Named `test_df` rather than `input` so the Python builtin `input` is not shadowed
test_df = pd.DataFrame(test_data)
tdl = dls.test_dl(test_df)
# The processed 'workclass' value must not be 0, i.e. the label was matched to a training category
test_ne(0, tdl.dataset.iloc[0]['workclass'])
#hide
# Run nbdev's notebook-to-script export; the cell output lists each notebook it converted
from nbdev.export import notebook2script
notebook2script()
Converted 00_torch_core.ipynb. Converted 01_layers.ipynb. Converted 02_data.load.ipynb. Converted 03_data.core.ipynb. Converted 04_data.external.ipynb. Converted 05_data.transforms.ipynb. Converted 06_data.block.ipynb. Converted 07_vision.core.ipynb. Converted 08_vision.data.ipynb. Converted 09_vision.augment.ipynb. Converted 09b_vision.utils.ipynb. Converted 09c_vision.widgets.ipynb. Converted 10_tutorial.pets.ipynb. Converted 11_vision.models.xresnet.ipynb. Converted 12_optimizer.ipynb. Converted 13_callback.core.ipynb. Converted 13a_learner.ipynb. Converted 13b_metrics.ipynb. Converted 14_callback.schedule.ipynb. Converted 14a_callback.data.ipynb. Converted 15_callback.hook.ipynb. Converted 15a_vision.models.unet.ipynb. Converted 16_callback.progress.ipynb. Converted 17_callback.tracker.ipynb. Converted 18_callback.fp16.ipynb. Converted 18a_callback.training.ipynb. Converted 19_callback.mixup.ipynb. Converted 20_interpret.ipynb. Converted 20a_distributed.ipynb. Converted 21_vision.learner.ipynb. Converted 22_tutorial.imagenette.ipynb. Converted 23_tutorial.vision.ipynb. Converted 24_tutorial.siamese.ipynb. Converted 24_vision.gan.ipynb. Converted 30_text.core.ipynb. Converted 31_text.data.ipynb. Converted 32_text.models.awdlstm.ipynb. Converted 33_text.models.core.ipynb. Converted 34_callback.rnn.ipynb. Converted 35_tutorial.wikitext.ipynb. Converted 36_text.models.qrnn.ipynb. Converted 37_text.learner.ipynb. Converted 38_tutorial.text.ipynb. Converted 39_tutorial.transformers.ipynb. Converted 40_tabular.core.ipynb. Converted 41_tabular.data.ipynb. Converted 42_tabular.model.ipynb. Converted 43_tabular.learner.ipynb. Converted 44_tutorial.tabular.ipynb. Converted 45_collab.ipynb. Converted 46_tutorial.collab.ipynb. Converted 50_tutorial.datablock.ipynb. Converted 60_medical.imaging.ipynb. Converted 61_tutorial.medical_imaging.ipynb. Converted 65_medical.text.ipynb. Converted 70_callback.wandb.ipynb. Converted 71_callback.tensorboard.ipynb. 
Converted 72_callback.neptune.ipynb. Converted 73_callback.captum.ipynb. Converted 74_callback.cutmix.ipynb. Converted 97_test_utils.ipynb. Converted 99_pytorch_doc.ipynb. Converted dev-setup.ipynb. Converted index.ipynb. Converted quick_start.ipynb. Converted tutorial.ipynb.