#hide
#skip
! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on colab
#default_exp tabular.core
#export
from fastai.torch_basics import *
from fastai.data.all import *
#hide
from nbdev.showdoc import *
#export
pd.set_option('mode.chained_assignment','raise')
Basic functions to preprocess tabular data before assembling it in a `DataLoaders`.
#export
def make_date(df, date_field):
"Make sure `df[date_field]` is of the right date type."
field_dtype = df[date_field].dtype
if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
field_dtype = np.datetime64
if not np.issubdtype(field_dtype, np.datetime64):
df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))
#export
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
"Helper function that adds columns relevant to a date in the column `field_name` of `df`."
make_date(df, field_name)
field = df[field_name]
prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
if time: attr = attr + ['Hour', 'Minute', 'Second']
    # Pandas deprecated `dt.week` in v1.1, so use `dt.isocalendar().week` when available
week = field.dt.isocalendar().week.astype(field.dt.day.dtype) if hasattr(field.dt, 'isocalendar') else field.dt.week
for n in attr: df[prefix + n] = getattr(field.dt, n.lower()) if n != 'Week' else week
mask = ~field.isna()
df[prefix + 'Elapsed'] = np.where(mask,field.values.astype(np.int64) // 10 ** 9,np.nan)
if drop: df.drop(field_name, axis=1, inplace=True)
return df
For example, if we have a series of dates we can then generate features such as `Year`, `Month`, `Day`, `Dayofweek`, `Is_month_start`, etc. as shown below:
df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
df.head()
 | Year | Month | Week | Day | Dayofweek | Dayofyear | Is_month_end | Is_month_start | Is_quarter_end | Is_quarter_start | Is_year_end | Is_year_start | Elapsed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019.0 | 12.0 | 49.0 | 4.0 | 2.0 | 338.0 | False | False | False | False | False | False | 1.575418e+09 |
1 | NaN | NaN | NaN | NaN | NaN | NaN | False | False | False | False | False | False | NaN |
2 | 2019.0 | 11.0 | 46.0 | 15.0 | 4.0 | 319.0 | False | False | False | False | False | False | 1.573776e+09 |
3 | 2019.0 | 10.0 | 43.0 | 24.0 | 3.0 | 297.0 | False | False | False | False | False | False | 1.571875e+09 |
#hide
test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
test_eq(df[df.Elapsed.isna()].shape,(1, 13))
# Test that week dtype is consistent with other datepart fields
test_eq(df['Year'].dtype, df['Week'].dtype)
test_eq(pd.api.types.is_numeric_dtype(df['Elapsed']), True)
#hide
df = pd.DataFrame({'f1': [1.],'f2': [2.],'f3': [3.],'f4': [4.],'date':['2019-12-04']})
df = add_datepart(df, 'date')
df.head()
 | f1 | f2 | f3 | f4 | Year | Month | Week | Day | Dayofweek | Dayofyear | Is_month_end | Is_month_start | Is_quarter_end | Is_quarter_start | Is_year_end | Is_year_start | Elapsed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 2019 | 12 | 49 | 4 | 2 | 338 | False | False | False | False | False | False | 1.575418e+09 |
#hide
# Test Order of columns when date isn't in first position
test_eq(df.columns, ['f1', 'f2', 'f3', 'f4', 'Year', 'Month', 'Week', 'Day',
'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
# Test that week dtype is consistent with other datepart fields
test_eq(df['Year'].dtype, df['Week'].dtype)
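`add_datepart` can also extract time features when `time=True`; a quick sketch with made-up timestamps:
df = pd.DataFrame({'date': ['2019-12-04 13:45:00', '2019-11-29 08:10:30']})
df = add_datepart(df, 'date', time=True)
test_eq('Hour' in df.columns and 'Minute' in df.columns and 'Second' in df.columns, True)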
#export
def _get_elapsed(df,field_names, date_field, base_field, prefix):
for f in field_names:
day1 = np.timedelta64(1, 'D')
last_date,last_base,res = np.datetime64(),None,[]
for b,v,d in zip(df[base_field].values, df[f].values, df[date_field].values):
if last_base is None or b != last_base:
last_date,last_base = np.datetime64(),b
if v: last_date = d
res.append(((d-last_date).astype('timedelta64[D]') / day1))
df[prefix + f] = res
return df
#export
def add_elapsed_times(df, field_names, date_field, base_field):
"Add in `df` for each event in `field_names` the elapsed time according to `date_field` grouped by `base_field`"
field_names = list(L(field_names))
#Make sure date_field is a date and base_field a bool
df[field_names] = df[field_names].astype('bool')
make_date(df, date_field)
work_df = df[field_names + [date_field, base_field]]
work_df = work_df.sort_values([base_field, date_field])
work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')
for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
work_df[a] = work_df[a].fillna(0).astype(int)
for a,s in zip([True, False], ['_bw', '_fw']):
work_df = work_df.set_index(date_field)
tmp = (work_df[[base_field] + field_names].sort_index(ascending=a)
.groupby(base_field).rolling(7, min_periods=1).sum())
tmp.drop(base_field,1,inplace=True)
tmp.reset_index(inplace=True)
work_df.reset_index(inplace=True)
work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', s])
work_df.drop(field_names,1,inplace=True)
return df.merge(work_df, 'left', [date_field, base_field])
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df.head()
 | date | event | base | Afterevent | Beforeevent | event_bw | event_fw |
---|---|---|---|---|---|---|---|
0 | 2019-12-04 | False | 1 | 5 | 0 | 1.0 | 0.0 |
1 | 2019-11-29 | True | 1 | 0 | 0 | 1.0 | 1.0 |
2 | 2019-11-15 | False | 2 | 22 | 0 | 1.0 | 0.0 |
3 | 2019-10-24 | True | 2 | 0 | 0 | 1.0 | 1.0 |
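Here `Afterevent` is the number of days since the last `event` in each `base` group, `Beforeevent` is the number of days until the next one, and `event_bw`/`event_fw` are rolling sums of the event flag over a window of 7 rows, looking backward and forward respectively. Several event columns can be processed in one call; a sketch with hypothetical `promo` and `holiday` flags:
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'promo': [True, False, True, False],
                   'holiday': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['promo','holiday'], 'date', 'base')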
#export
def cont_cat_split(df, max_card=20, dep_var=None):
"Helper function that returns column names of cont and cat variables from given `df`."
cont_names, cat_names = [], []
for label in df:
if label in L(dep_var): continue
if ((pd.api.types.is_integer_dtype(df[label].dtype) and
df[label].unique().shape[0] > max_card) or
pd.api.types.is_float_dtype(df[label].dtype)):
cont_names.append(label)
else: cat_names.append(label)
return cont_names, cat_names
This function works by determining if a column is continuous or categorical based on the cardinality of its values. If the cardinality is above the `max_card` parameter (or the column is of a float datatype) then it will be added to `cont_names`, otherwise to `cat_names`. An example is below:
# Example with simple numpy types
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],
'i8': pd.Series([1, 2, 3, 4], dtype='int8'),
'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),
'f16': pd.Series([1, 2, 3, 4], dtype='float16'),
'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})
cont_names, cat_names = cont_cat_split(df)
#hide_input
print(f'cont_names: {cont_names}\ncat_names: {cat_names}')
cont_names: ['cont1', 'f16']
cat_names: ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']
#hide
# Test all columns
cont, cat = cont_cat_split(df)
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']))
# Test exclusion of dependent variable
cont, cat = cont_cat_split(df, dep_var='y1')
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8', 'y2']))
# Test exclusion of multi-label dependent variables
cont, cat = cont_cat_split(df, dep_var=['y1', 'y2'])
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8']))
# Test maximal cardinality bound for int variable
cont, cat = cont_cat_split(df, max_card=3)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16'], ['cat2', 'y1', 'y2']))
cont, cat = cont_cat_split(df, max_card=2)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16', 'y2'], ['cat2', 'y1']))
cont, cat = cont_cat_split(df, max_card=1)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16', 'y1', 'y2'], ['cat2']))
# Example with pandas types and generated columns
df = pd.DataFrame({'cat1': pd.Series(['l','xs','xl','s'], dtype='category'),
'ui32': pd.Series([1, 2, 3, 4], dtype='UInt32'),
'i64': pd.Series([1, 2, 3, 4], dtype='Int64'),
'f16': pd.Series([1, 2, 3, 4], dtype='Float64'),
'd1_date': ['2021-02-09', None, '2020-05-12', '2020-08-14'],
})
df = add_datepart(df, 'd1_date', drop=False)
df['cat1'].cat.set_categories(['xl','l','m','s','xs'], ordered=True, inplace=True)
cont_names, cat_names = cont_cat_split(df, max_card=0)
#hide_input
print(f'cont_names: {cont_names}\ncat_names: {cat_names}')
cont_names: ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed']
cat_names: ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']
#hide
cont, cat = cont_cat_split(df, max_card=0)
test_eq((cont, cat), (
['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed'],
['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']
))
#export
def df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False):
"Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion."
# 1: Build column filter and typemap
excl_types, skip = {'category','datetime64[ns]','bool'}, set(skip)
typemap = {'int' : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.int8, np.int16, np.int32, np.int64)],
'uint' : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.uint8, np.uint16, np.uint32, np.uint64)],
'float' : [(np.dtype(x), np.finfo(x).min, np.finfo(x).max) for x in (np.float32, np.float64, np.longdouble)]
}
if obj2cat: typemap['object'] = 'category' # User wants to categorify dtype('Object'), which may not always save space
else: excl_types.add('object')
new_dtypes = {}
exclude = lambda dt: dt[1].name not in excl_types and dt[0] not in skip
for c, old_t in filter(exclude, df.dtypes.items()):
t = next((v for k,v in typemap.items() if old_t.name.startswith(k)), None)
if isinstance(t, list): # Find the smallest type that fits
if int2uint and t==typemap['int'] and df[c].min() >= 0: t=typemap['uint']
new_t = next((r[0] for r in t if r[1]<=df[c].min() and r[2]>=df[c].max()), None)
if new_t and new_t == old_t: new_t = None
else: new_t = t if isinstance(t, str) else None
if new_t: new_dtypes[c] = new_t
return new_dtypes
show_doc(df_shrink_dtypes, title_level=3)
df_shrink_dtypes [source]

df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False)

Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion.
For example we will make a sample `DataFrame` with `int`, `float`, `bool`, and `object` datatypes:
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
'date':['2019-12-04','2019-11-29','2019-11-15',]})
df.dtypes
i          int64
f        float64
e           bool
date      object
dtype: object
We can then call `df_shrink_dtypes` to find the smallest possible datatype that can support the data:
dt = df_shrink_dtypes(df)
dt
{'i': dtype('int8'), 'f': dtype('float32'), 'date': 'category'}
#hide
test_eq(df['i'].dtype, 'int64')
test_eq(dt['i'], 'int8')
test_eq(df['f'].dtype, 'float64')
test_eq(dt['f'], 'float32')
# Default ignore 'object' and 'boolean' columns
test_eq(df['date'].dtype, 'object')
test_eq(dt['date'], 'category')
# Test categorifying 'object' type
dt2 = df_shrink_dtypes(df, obj2cat=False)
test_eq('date' not in dt2, True)
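The returned mapping can then be applied with `DataFrame.astype`; this is exactly what `df_shrink` (below) does for you:
df_small = df.astype(dt)
test_eq(df_small['i'].dtype, 'int8')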
#export
def df_shrink(df, skip=[], obj2cat=True, int2uint=False):
"Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`."
dt = df_shrink_dtypes(df, skip, obj2cat=obj2cat, int2uint=int2uint)
return df.astype(dt)
show_doc(df_shrink, title_level=3)
df_shrink [source]

df_shrink(df, skip=[], obj2cat=True, int2uint=False)

Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`.
`df_shrink(df)` attempts to make a DataFrame use less memory, by fitting numeric columns into the smallest datatypes. In addition:

  * `boolean`, `category`, and `datetime64[ns]` dtype columns are ignored.
  * `object` type columns are converted to `category` by default, which may not always save space; this can be turned off with `obj2cat=False`.
  * `int2uint=True` fits `int` types to `uint` types, if all data in the column is >= 0.
  * columns can be excluded by name using `skip=['col1','col2']`.

To get only the new column data types without actually casting the DataFrame, use `df_shrink_dtypes()` with all the same parameters as for `df_shrink()`.
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])
Let's compare the two:
df.dtypes
i          int64
f        float64
u          int64
date      object
dtype: object
df2.dtypes
i          int8
f       float32
u         int16
date     object
dtype: object
We can see that the datatypes changed, and even further we can look at their relative memory usages:
#hide_input
print(f'Initial Dataframe: {df.memory_usage().sum()} bytes')
print(f'Reduced Dataframe: {df2.memory_usage().sum()} bytes')
Initial Dataframe: 176 bytes
Reduced Dataframe: 125 bytes
#hide
test_eq(df['i'].dtype=='int64' and df2['i'].dtype=='int8', True)
test_eq(df['f'].dtype=='float64' and df2['f'].dtype=='float32', True)
test_eq(df['u'].dtype=='int64' and df2['u'].dtype=='int16', True)
test_eq(df2['date'].dtype, 'object')
test_eq(df2.memory_usage().sum() < df.memory_usage().sum(), True)
# Test int => uint (when col.min() >= 0)
df3 = df_shrink(df, int2uint=True)
test_eq(df3['u'].dtype, 'uint8') # int64 -> uint8 instead of int16
# Test excluding columns
df4 = df_shrink(df, skip=['i','u'])
test_eq(df['i'].dtype, df4['i'].dtype)
test_eq(df4['u'].dtype, 'int64')
Here's another example using the `ADULT_SAMPLE` dataset:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
#hide_input
print(f'Initial Dataframe: {df.memory_usage().sum() / 1000000} megabytes')
print(f'Reduced Dataframe: {new_df.memory_usage().sum() / 1000000} megabytes')
Initial Dataframe: 3.9074 megabytes
Reduced Dataframe: 0.818617 megabytes
We reduced the overall memory used by 79%!
#export
class _TabIloc:
"Get/set rows by iloc and cols by name"
def __init__(self,to): self.to = to
def __getitem__(self, idxs):
df = self.to.items
if isinstance(idxs,tuple):
rows,cols = idxs
cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
else: rows,cols = idxs,slice(None)
return self.to.new(df.iloc[rows, cols])
#export
class Tabular(CollBase, GetAttr, FilteredBase):
"A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
_default,with_cont='procs',True
def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
do_setup=True, device=None, inplace=False, reduce_memory=True):
if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
if not inplace: df = df.copy()
if reduce_memory: df = df_shrink(df)
if splits is not None: df = df.iloc[sum(splits, [])]
self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
super().__init__(df)
self.y_names,self.device = L(y_names),device
if y_block is None and self.y_names:
# Make ys categorical if they're not numeric
ys = df[self.y_names]
if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
else: y_block = RegressionBlock()
if y_block is not None and do_setup:
if callable(y_block): y_block = y_block()
procs = L(procs) + y_block.type_tfms
self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
self.split = len(df) if splits is None else len(splits[0])
if do_setup: self.setup()
def new(self, df, inplace=False):
return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(), inplace=inplace,
**attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))
def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
def copy(self): self.items = self.items.copy(); return self
def decode(self): return self.procs.decode(self)
def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
def setup(self): self.procs.setup(self)
def process(self): self.procs(self)
def loc(self): return self.items.loc
def iloc(self): return _TabIloc(self)
def targ(self): return self.items[self.y_names]
def x_names (self): return self.cat_names + self.cont_names
def n_subsets(self): return 2
def y(self): return self[self.y_names[0]]
def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
def to_device(self, d=None):
self.device = d
return self
def all_col_names (self):
ys = [n for n in self.y_names if n in self.items.columns]
return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names
properties(Tabular,'loc','iloc','targ','all_col_names','n_subsets','x_names','y')
  * `df`: A `DataFrame` of your data
  * `cat_names`: Your categorical `x` variables
  * `cont_names`: Your continuous `x` variables
  * `y_names`: Your dependent `y` variables
  * `y_block`: How to sub-categorize the type of `y_names` (`CategoryBlock` or `RegressionBlock`)
  * `splits`: How to split your data
  * `do_setup`: A parameter for if `Tabular` will run the data through the `procs` upon initialization
  * `device`: `cuda` or `cpu`
  * `inplace`: If `True`, `Tabular` will not keep a separate copy of your original `DataFrame` in memory. You should ensure `pd.options.mode.chained_assignment` is `None` before setting this
  * `reduce_memory`: `fastai` will attempt to reduce the overall memory usage by the inputted `DataFrame` with `df_shrink`
#export
class TabularPandas(Tabular):
"A `Tabular` object with transforms"
def transform(self, cols, f, all_col=True):
if not all_col: cols = [c for c in cols if c in self.items.columns]
if len(cols) > 0: self[cols] = self[cols].transform(f)
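A minimal sketch of both pieces together: build a `TabularPandas` from a small dummy `DataFrame`, then use `transform` to apply a function to a chosen column (the values here are made up):
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0.5,1.5,2.5,3.5,4.5]})
to = TabularPandas(df, cat_names='a', cont_names='b')
to.transform(['b'], lambda c: c*2)
test_eq(to['b'].values, np.array([1., 3., 5., 7., 9.]))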
#export
def _add_prop(cls, nm):
@property
def f(o): return o[list(getattr(o,nm+'_names'))]
@f.setter
def fset(o, v): o[getattr(o,nm+'_names')] = v
setattr(cls, nm+'s', f)
setattr(cls, nm+'s', fset)
_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'x')
_add_prop(Tabular, 'all_col')
#hide
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)
test_eq(to.all_cols,to[['a']])
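Rows are selected by position and columns by name through `to.iloc` (implemented by `_TabIloc` above); a quick sketch using the `to` defined just above:
subset = to.iloc[:2, ['a']]
test_eq(len(subset), 2)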
#hide
import gc
def _count_objs(o):
    "Count the number of instances of class `o` currently tracked by the garbage collector"
    objs = gc.get_objects()
    return len([x for x in objs if isinstance(x, o)])
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
df_b = pd.DataFrame({'a':[1,2,0,0,2], 'b':[1,0,3,0,1]})
to = TabularPandas(df, cat_names='a', inplace=True)
_init_count = _count_objs(pd.DataFrame)
to_new = to.new(df_b, inplace=True)
test_eq(_init_count, _count_objs(pd.DataFrame))
#export
class TabularProc(InplaceTransform):
"Base class to write a non-lazy tabular processor for dataframes"
def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
super().setup(getattr(items,'train',items), train_setup=False)
# Procs are called as soon as data is available
return self(items.items if isinstance(items,Datasets) else items)
@property
def name(self): return f"{super().name} -- {getattr(self,'__stored_args__',{})}"
These transforms are applied as soon as the data is available, rather than as the data is called from the `DataLoader`.
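As a sketch of what a custom proc could look like (hypothetical, not part of fastai), here is one that log-scales the continuous variables, following the same structure the built-in procs use:
class LogConts(TabularProc):
    "Log-scale the continuous variables (an illustrative sketch)"
    order = 2
    def setups(self, to): pass # nothing to learn from the training set
    def encodes(self, to): to.conts = np.log1p(to.conts)
    def decodes(self, to): to.conts = np.expm1(to.conts)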
#export
def _apply_cats (voc, add, c):
if not is_categorical_dtype(c):
return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))
#export
class Categorify(TabularProc):
"Transform the categorical variables to something similar to `pd.Categorical`"
order = 1
def setups(self, to):
store_attr(classes={n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names}, but='to')
def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
def decodes(self, to): to.transform(to.cat_names, partial(_decode_cats, self.classes))
def __getitem__(self,k): return self.classes[k]
#exporti
@Categorize
def setups(self, to:Tabular):
if len(to.y_names) > 0:
if self.vocab is None:
self.vocab = CategoryMap(getattr(to, 'train', to).iloc[:,to.y_names[0]].items, strict=True)
else:
self.vocab = CategoryMap(self.vocab, sort=False, add_na=self.add_na)
self.c = len(self.vocab)
return self(to)
@Categorize
def encodes(self, to:Tabular):
to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0), all_col=False)
return to
@Categorize
def decodes(self, to:Tabular):
to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}), all_col=False)
return to
show_doc(Categorify, title_level=3)
class Categorify [source]

Categorify(enc=None, dec=None, split_idx=None, order=None) :: TabularProc

Transform the categorical variables to something similar to `pd.Categorical`
While visually in the `DataFrame` you will not see a change, the classes are stored in `to.procs.categorify` as we can see below on a dummy `DataFrame`:
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
to.show()
 | a |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 0 |
4 | 2 |
Each column's unique values are stored in a dictionary of `column:[values]`:
cat = to.procs.categorify
cat.classes
{'a': ['#na#', 0, 1, 2]}
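When new data is processed through these classes, values that did not appear in the training data are mapped to `#na#` (code 0); a quick sketch:
df_new = pd.DataFrame({'a':[1,0,3]}) # 3 was never seen during setup
to_new = to.new(df_new)
to_new.process()
test_eq(list(to_new['a']), [2,1,0])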
#hide
def test_series(a,b): return test_eq(list(a), b)
test_series(cat['a'], ['#na#',0,1,2])
test_series(to['a'], [1,2,3,1,3])
#hide
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_series(to1['a'], [2,1,0,0,3])
to2 = cat.decode(to1)
test_series(to2['a'], [1,0,'#na#','#na#',2])
#hide
#test with splits
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_series(cat['a'], ['#na#',0,1,2])
test_series(to['a'], [1,2,3,0,3])
#hide
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_series(cat['a'], ['#na#','H','M','L'])
test_series(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_series(to2['a'], ['M','H','L','M'])
#hide
#test with targets
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
test_series(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_series(to2['b'], ['a', 'b', 'a', 'b', 'b'])
#hide
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
test_series(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_series(to2['b'], ['a', 'b', 'a', 'b', 'b'])
#hide
#test with targets and train
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
#hide
#test to ensure no copies of the dataframe are stored
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, cat, cont_names='a', splits=[[0,1,2],[3,4]])
test_eq(hasattr(to.categorify, 'to'), False)
#exporti
@Normalize
def setups(self, to:Tabular):
store_attr(but='to', means=dict(getattr(to, 'train', to).conts.mean()),
stds=dict(getattr(to, 'train', to).conts.std(ddof=0)+1e-7))
return self(to)
@Normalize
def encodes(self, to:Tabular):
to.conts = (to.conts-self.means) / self.stds
return to
@Normalize
def decodes(self, to:Tabular):
to.conts = (to.conts*self.stds ) + self.means
return to
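A quick sketch of the tabular `Normalize` patch above on a single continuous column; the statistics computed during setup are stored in `means` and `stds`:
norm = Normalize()
df = pd.DataFrame({'a':[0.,1.,2.,3.,4.]})
to = TabularPandas(df, norm, cont_names='a')
test_close(norm.means['a'], 2.0)
test_close(norm.stds['a'], np.array([0.,1.,2.,3.,4.]).std())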
#hide
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)
#hide
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2['a'].values, [5,6,7])
#hide
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)
#hide
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
test_eq(hasattr(to.procs.normalize, 'to'), False)
#export
class FillStrategy:
"Namespace containing the various filling strategies."
def median (c,fill): return c.median()
def constant(c,fill): return fill
def mode (c,fill): return c.dropna().value_counts().idxmax()
Currently, filling with the `median`, a `constant`, and the `mode` are supported.
#export
class FillMissing(TabularProc):
"Fill the missing values in continuous columns."
def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
if fill_vals is None: fill_vals = defaultdict(int)
store_attr()
def setups(self, to):
missing = pd.isnull(to.conts).any()
store_attr(but='to', na_dict={n:self.fill_strategy(to[n], self.fill_vals[n])
for n in missing[missing].keys()})
self.fill_strategy = self.fill_strategy.__name__
def encodes(self, to):
missing = pd.isnull(to.conts)
for n in missing.any()[missing.any()].keys():
assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
for n in self.na_dict.keys():
to[n].fillna(self.na_dict[n], inplace=True)
if self.add_col:
to.loc[:,n+'_na'] = missing[n]
if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')
show_doc(FillMissing, title_level=3)
class FillMissing [source]

FillMissing(fill_strategy=median, add_col=True, fill_vals=None) :: TabularProc

Fill the missing values in continuous columns.
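A sketch of choosing a different strategy: with `FillStrategy.mode`, the most frequent value fills the gaps (here 1.0), and the computed fill values are stored in `na_dict`:
fill = FillMissing(fill_strategy=FillStrategy.mode)
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, fill, cont_names='a')
test_eq(fill.na_dict, {'a': 1.0})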
#hide
fill1,fill2,fill3 = (FillMissing(fill_strategy=s)
for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = (TabularPandas(df, fill1, cont_names='a'),
TabularPandas(df1, fill2, cont_names='a'),
TabularPandas(df2, fill3, cont_names='a'))
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})
for t in tos: test_eq(t.cat_names, ['a_na'])
for to_,v in zip(tos, [1.5, 0., 1.]):
test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
#hide
fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))
#hide
fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(hasattr(to.procs.fill_missing, 'to'), False)
#hide
procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')
#Test setup and apply on df_main
test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_series(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
#hide
#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')
test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_series(to['b_na'], [1,1,2,1,1,1,1])
test_series(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])
#hide
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')
test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype, np.int64 if sys.platform == "win32" else int)
test_series(to['b_na'], [1,1,2,1,1,1,1])
test_series(to['c'], [1,0,1,0,0,1,0])
#hide
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])
test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype, np.int64 if sys.platform == "win32" else int)
test_series(to['b_na'], [1,2,1,1,1,1,1])
test_series(to['c'], [1,0,0,0,1,0,1])
#export
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o
#export
class ReadTabBatch(ItemTransform):
"Transform `TabularPandas` values into a `Tensor` with the ability to decode"
def __init__(self, to): self.to = to.new_empty()
def encodes(self, to):
if not to.with_cont: res = (tensor(to.cats).long(),)
else: res = (tensor(to.cats).long(),tensor(to.conts).float())
ys = [n for n in to.y_names if n in to.items.columns]
if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
if to.device is not None: res = to_device(res, to.device)
return res
def decodes(self, o):
o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
vals = np.concatenate(o, axis=1)
try: df = pd.DataFrame(vals, columns=self.to.all_col_names)
except: df = pd.DataFrame(vals, columns=self.to.x_names)
to = self.to.new(df)
return to
#export
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
x.show()
#export
@delegates()
class TabDataLoader(TfmdDL):
"A transformed `DataLoader` for Tabular data"
def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset)
super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)
def create_batch(self, b): return self.dataset.iloc[b]
def do_item(self, s): return 0 if s is None else s
TabularPandas._dl_type = TabDataLoader
For a more in-depth explanation, see the tabular tutorial.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()
 | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()
 | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Private | HS-grad | Never-married | Handlers-cleaners | Own-child | Black | False | 28.000000 | 335356.999710 | 9.0 | <50k |
1 | ? | HS-grad | Married-civ-spouse | ? | Husband | White | False | 65.999999 | 37330.998172 | 9.0 | <50k |
2 | Private | Masters | Never-married | #na# | Not-in-family | Asian-Pac-Islander | False | 32.000000 | 116137.997932 | 14.0 | <50k |
3 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 45.000000 | 273434.998017 | 9.0 | <50k |
4 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 51.000000 | 101431.996842 | 9.0 | <50k |
5 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Husband | White | False | 48.000000 | 332465.003428 | 13.0 | <50k |
6 | Private | Some-college | Never-married | Sales | Own-child | White | False | 17.999999 | 192409.000024 | 10.0 | <50k |
7 | Private | HS-grad | Divorced | Machine-op-inspct | Unmarried | Black | True | 37.000000 | 175390.000108 | 10.0 | <50k |
8 | Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 38.000000 | 192337.000006 | 13.0 | >=50k |
9 | Federal-gov | HS-grad | Married-civ-spouse | Adm-clerical | Husband | White | False | 37.000000 | 32528.006470 | 9.0 | >=50k |
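Under the hood, `ReadTabBatch` turns each mini-batch into a tuple of tensors: long-typed category codes, float-typed continuous values, and the targets. A quick way to inspect this (a sketch):
cats, conts, ys = dls.one_batch()
test_eq(cats.dtype, torch.int64)
test_eq(conts.dtype, torch.float32)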
to.show()
 | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary |
---|---|---|---|---|---|---|---|---|---|---|---|
279 | Private | HS-grad | Never-married | #na# | Own-child | White | True | 20.0 | 155775.0 | 10.0 | <50k |
6459 | Private | HS-grad | Divorced | Craft-repair | Not-in-family | White | False | 55.0 | 35551.0 | 9.0 | <50k |
5544 | Private | Assoc-voc | Divorced | Tech-support | Not-in-family | Black | False | 53.0 | 479621.0 | 11.0 | <50k |
3500 | ? | 10th | Never-married | ? | Not-in-family | White | False | 19.0 | 182590.0 | 6.0 | <50k |
3788 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Sales | Husband | White | False | 31.0 | 340880.0 | 13.0 | <50k |
4002 | Self-emp-not-inc | Some-college | Never-married | Sales | Own-child | White | False | 30.0 | 196342.0 | 10.0 | <50k |
204 | ? | HS-grad | Married-civ-spouse | #na# | Husband | White | True | 60.0 | 174073.0 | 10.0 | <50k |
9097 | Private | HS-grad | Married-civ-spouse | Adm-clerical | Husband | White | False | 39.0 | 83893.0 | 9.0 | >=50k |
5972 | Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 48.0 | 105838.0 | 13.0 | >=50k |
5661 | Private | HS-grad | Never-married | Adm-clerical | Own-child | White | False | 26.0 | 262656.0 | 9.0 | <50k |
We can decode any set of transformed data by calling `to.decode_row` with our raw data:
row = to.items.iloc[0]
to.decode_row(row)
age                            20.0
workclass                   Private
fnlwgt                     155775.0
education                   HS-grad
education-num                  10.0
marital-status        Never-married
occupation                     #na#
relationship              Own-child
race                          White
sex                            Male
capital-gain                      0
capital-loss                      0
hours-per-week                   30
native-country        United-States
salary                         <50k
education-num_na               True
Name: 279, dtype: object
We can make new test datasets based on the training data with `to.new()`.

Note: Since machine learning models can't magically understand categories they were never trained on, the data should reflect this. If there are different missing values in your test data you should address this before training
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()
 | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | education-num_na |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10000 | 0.455476 | 5 | 1.326789 | 10 | 1.178200 | 3 | 2 | 1 | 2 | Male | 0 | 0 | 40 | Philippines | 1 |
10001 | -0.936297 | 5 | 1.240484 | 12 | -0.420714 | 3 | 15 | 1 | 4 | Male | 0 | 0 | 40 | United-States | 1 |
10002 | 1.041486 | 5 | 0.146895 | 2 | -1.220171 | 1 | 9 | 2 | 5 | Female | 0 | 0 | 37 | United-States | 1 |
10003 | 0.528727 | 5 | -0.282639 | 12 | -0.420714 | 7 | 2 | 5 | 5 | Female | 0 | 0 | 43 | United-States | 1 |
10004 | 0.748481 | 6 | 1.428478 | 9 | 0.378743 | 3 | 5 | 1 | 5 | Male | 0 | 0 | 60 | United-States | 1 |
We can then convert it to a `DataLoader`:
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()
 | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num |
---|---|---|---|---|---|---|---|---|---|---|
0 | Private | Bachelors | Married-civ-spouse | Adm-clerical | Husband | Asian-Pac-Islander | False | 45.000000 | 338105.001967 | 13.0 |
1 | Private | HS-grad | Married-civ-spouse | Transport-moving | Husband | Other | False | 26.000000 | 328663.005601 | 9.0 |
2 | Private | 11th | Divorced | Other-service | Not-in-family | White | False | 53.000000 | 209021.999795 | 7.0 |
3 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | False | 46.000000 | 162029.999497 | 9.0 |
4 | Self-emp-inc | Assoc-voc | Married-civ-spouse | Exec-managerial | Husband | White | False | 49.000000 | 349229.997780 | 11.0 |
5 | Local-gov | Some-college | Married-civ-spouse | Exec-managerial | Husband | White | False | 34.000000 | 124827.002450 | 10.0 |
6 | Self-emp-inc | Some-college | Married-civ-spouse | Sales | Husband | White | False | 53.000000 | 290640.001644 | 10.0 |
7 | Private | Some-college | Never-married | Sales | Own-child | White | False | 19.000000 | 106272.998740 | 10.0 |
8 | Private | Some-college | Married-civ-spouse | Protective-serv | Husband | Black | False | 72.000001 | 53684.003462 | 10.0 |
9 | Private | Some-college | Never-married | Sales | Own-child | White | False | 20.000000 | 505980.007069 | 10.0 |
def _mock_multi_label(df):
sal,sex,white = [],[],[]
for row in df.itertuples():
sal.append(row.salary == '>=50k')
sex.append(row.sex == ' Male')
white.append(row.race == ' White')
df['salary'] = np.array(sal)
df['male'] = np.array(sex)
df['white'] = np.array(white)
return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
 | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | male | white |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | True | False | True |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | True | True | True |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | False | False | False |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | True | True | False |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | False | False | False |
#exporti
@EncodedMultiCategorize
def setups(self, to:Tabular):
self.c = len(self.vocab)
return self(to)
@EncodedMultiCategorize
def encodes(self, to:Tabular): return to
@EncodedMultiCategorize
def decodes(self, to:Tabular):
to.transform(to.y_names, lambda c: c==1)
return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)
CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 59.4 ms
dls = to.dataloaders()
dls.valid.show_batch()
 | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | male | white |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Private | HS-grad | Married-civ-spouse | Sales | Husband | White | False | 47.000000 | 186533.999848 | 9.0 | True | True | True |
1 | Private | Some-college | Never-married | Adm-clerical | Not-in-family | White | False | 32.000000 | 115631.001216 | 10.0 | False | False | True |
2 | Federal-gov | Some-college | Widowed | Exec-managerial | Not-in-family | White | False | 60.000001 | 27466.003873 | 10.0 | False | False | True |
3 | Private | HS-grad | Never-married | Other-service | Not-in-family | White | False | 49.000000 | 129639.997602 | 9.0 | False | False | True |
4 | Local-gov | Prof-school | Married-civ-spouse | Prof-specialty | Husband | White | False | 37.000000 | 265038.001582 | 15.0 | True | True | True |
5 | Private | Bachelors | Never-married | Handlers-cleaners | Other-relative | White | False | 23.000001 | 256755.002929 | 13.0 | False | False | True |
6 | Private | HS-grad | Never-married | Machine-op-inspct | Not-in-family | White | False | 39.000000 | 185052.999958 | 9.0 | False | False | True |
7 | Private | HS-grad | Never-married | Handlers-cleaners | Own-child | White | False | 28.000000 | 189346.000139 | 9.0 | False | True | True |
8 | Private | 10th | Married-civ-spouse | Other-service | Husband | Asian-Pac-Islander | False | 35.000000 | 176122.999494 | 6.0 | False | True | False |
9 | Private | 5th-6th | Never-married | Machine-op-inspct | Other-relative | White | False | 25.000000 | 521399.996882 | 3.0 | False | True | True |
def _mock_multi_label(df):
targ = []
for row in df.itertuples():
labels = []
if row.salary == '>=50k': labels.append('>50k')
if row.sex == ' Male': labels.append('male')
if row.race == ' White': labels.append('white')
targ.append(' '.join(labels))
df['target'] = np.array(targ)
return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
 | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | target |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k | >50k white |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k | >50k male white |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k | |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k | >50k male |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
@MultiCategorize
def encodes(self, to:Tabular):
#to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
return to
@MultiCategorize
def decodes(self, to:Tabular):
#to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)
CPU times: user 68 ms, sys: 0 ns, total: 68 ms
Wall time: 65 ms
to.procs[2].vocab
['-', '_', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
#exporti
@RegressionSetup
def setups(self, to:Tabular):
if self.c is not None: return
self.c = len(to.y_names)
return to
@RegressionSetup
def encodes(self, to:Tabular): return to
@RegressionSetup
def decodes(self, to:Tabular): return to
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)
CPU times: user 60 ms, sys: 4 ms, total: 64 ms
Wall time: 63.3 ms
to.procs[-1].means
{'fnlwgt': 192492.332875, 'education-num': 10.075499534606934}
dls = to.dataloaders()
dls.valid.show_batch()
 | workclass | education | marital-status | occupation | relationship | race | education-num_na | fnlwgt | education-num | age |
---|---|---|---|---|---|---|---|---|---|---|
0 | Private | 9th | Married-civ-spouse | Machine-op-inspct | Husband | White | False | 288185.002301 | 5.0 | 25.0 |
1 | Self-emp-inc | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 383492.997753 | 9.0 | 44.0 |
2 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 84136.001920 | 9.0 | 40.0 |
3 | Private | Bachelors | Never-married | Handlers-cleaners | Own-child | White | True | 31778.002656 | 10.0 | 28.0 |
4 | Private | Some-college | Married-civ-spouse | Adm-clerical | Husband | Black | False | 193036.000001 | 10.0 | 34.0 |
5 | Private | 10th | Divorced | Machine-op-inspct | Not-in-family | Black | False | 131713.998819 | 6.0 | 29.0 |
6 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | False | 275632.002074 | 9.0 | 30.0 |
7 | Private | HS-grad | Married-civ-spouse | Other-service | Husband | White | False | 107236.003015 | 9.0 | 27.0 |
8 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | Black | False | 83878.997816 | 9.0 | 28.0 |
9 | Private | 7th-8th | Never-married | Handlers-cleaners | Own-child | White | False | 255476.000025 | 4.0 | 29.0 |
class TensorTabular(fastuple):
def get_ctxs(self, max_n=10, **kwargs):
n_samples = min(self[0].shape[0], max_n)
df = pd.DataFrame(index = range(n_samples))
return [df.iloc[i] for i in range(n_samples)]
def display(self, ctxs): display_df(pd.DataFrame(ctxs))
class TabularLine(pd.Series):
"A line of a dataframe that knows how to show itself"
def show(self, ctx=None, **kwargs): return self if ctx is None else ctx.append(self)
class ReadTabLine(ItemTransform):
def __init__(self, proc): self.proc = proc
def encodes(self, row):
cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
return TensorTabular(tensor(cats).long(),tensor(conts).float())
def decodes(self, o):
to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
to = self.proc.decode(to)
return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))
class ReadTabTarget(ItemTransform):
def __init__(self, proc): self.proc = proc
def encodes(self, row): return row[self.proc.y_names].astype(np.int64)
def decodes(self, o): return Category(self.proc.classes[self.proc.y_names][o])
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)
# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')
# test_stdout(lambda: print(show_at(tds, 1)), """a 1
# b_na False
# b 1
# category a
# dtype: object""")
#hide
from nbdev.export import notebook2script
notebook2script()