Install fastai and show version info:
pip install fastai --quiet
pip show fastai fastcore
Name: fastai Version: 0.0.17 Summary: Version 2 of the fastai library Home-page: https://github.com/fastai/fastai Author: Jeremy Howard, Sylvain Gugger, and contributors Author-email: info@fast.ai License: Apache Software License 2.0 Location: /usr/local/lib/python3.6/dist-packages Requires: requests, pillow, torchvision, pandas, spacy, pyyaml, fastprogress, scipy, scikit-learn, fastcore, torch, matplotlib Required-by: --- Name: fastcore Version: 0.1.17 Summary: Python supercharged for fastai development Home-page: https://github.com/fastai/fastcore Author: Jeremy Howard and Sylvain Gugger Author-email: infos@fast.ai License: Apache Software License 2.0 Location: /usr/local/lib/python3.6/dist-packages Requires: numpy, dataclasses Required-by: fastai
from fastai.tabular.all import *
# Resolve the Adult sample dataset location via `untar_data` and load its CSV
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# Column names treated as categorical and as continuous features
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps, later passed as `procs`
procs = [Categorify, FillMissing, Normalize]
# Name of the target column
y_names = 'salary'
# Split produced by a default `RandomSplitter` applied to every row position of `df`
splits = RandomSplitter()(range_of(df))
TabularPandas
# Build the `TabularPandas` object from the DataFrame and the settings defined above
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_names,
    splits=splits,
)
Raw x's and y's:
to.train.xs.iloc[:3]
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | |
---|---|---|---|---|---|---|---|---|---|---|
27147 | 5 | 8 | 5 | 14 | 2 | 5 | 1 | -0.631822 | -0.972539 | 0.751850 |
20557 | 5 | 12 | 1 | 11 | 2 | 5 | 1 | -0.705017 | -1.500515 | -0.424423 |
5537 | 5 | 10 | 3 | 13 | 1 | 5 | 1 | 0.026942 | -0.122164 | 1.143940 |
to.train.ys.iloc[:3]
salary | |
---|---|
23736 | 0 |
24771 | 0 |
6144 | 0 |
from IPython.utils import io as io_p
def get_b_w(t):
    "Return `(best, worst)` of timing result `t`, each multiplied by 1000 and rounded to 2 places"
    # `t` only needs `.best` and `.worst` attributes (here: the object returned by `%timeit -o`)
    return tuple(round(value * 1000, 2) for value in (t.best, t.worst))
def get_avg(a, b, dl):
    "Return `a` and `b` each divided by `len(dl)` and rounded to 2 places"
    # `len(dl)` is the divisor, so an empty `dl` raises `ZeroDivisionError`
    n_batches = len(dl)
    return round(a / n_batches, 2), round(b / n_batches, 2)
fastai
DataLoader¶dls = to.dataloaders(bs=128, device='cpu')
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
t = %timeit -o next(iter(dls.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
t = %timeit -o next(iter(dls.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
t = %timeit -o for _ in dls.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
t = %timeit -o for _ in dls.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout
print(out)
Type = NumPy Device = cpu Batch Size = 128 First Batch: `train`: Best: 19.43ms, Worst: 20.96ms `valid`: Best: 3.59ms, Worst: 3.69ms All Batches: `train`: Best: 703.5ms, Worst: 726.88ms `valid`: Best: 170.05ms, Worst: 176.59ms Average Per Batch: `train`: Best: 3.47ms/batch, Worst: 3.58ms/batch `valid`: Best: 3.33ms/batch, Worst: 3.46ms/batch
# Repeat the timing report with the loaders' device set to 'cuda'
dls.device = 'cuda'
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = fastai\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Collect the buffered report, restore the real stdout, then show the report
out = new_stdout.getvalue()
sys.stdout = old_stdout
print(out)
Type = fastai Device = cuda Batch Size = 128 First Batch: `train`: Best: 20.0ms, Worst: 2852.74ms `valid`: Best: 3.6ms, Worst: 4.01ms All Batches: `train`: Best: 711.13ms, Worst: 723.7ms `valid`: Best: 174.96ms, Worst: 180.12ms Average Per Batch: `train`: Best: 3.5ms/batch, Worst: 3.57ms/batch `valid`: Best: 3.43ms/batch, Worst: 3.53ms/batch
DL
class TabDataset():
    "A `NumPy` dataset from a `TabularPandas` object"
    def __init__(self, to, bs=1):
        # `to` must expose `cats`, `conts` and `ys` objects with a `to_numpy()` method.
        # `np.int64` replaces `np.long`: that alias is missing from some NumPy releases
        # and has a platform-dependent width in others.
        self.cats = to.cats.to_numpy().astype(np.int64)
        self.conts = to.conts.to_numpy().astype(np.float32)
        self.ys = to.ys.to_numpy()
        # Number of rows returned per `__getitem__` call. A loader may overwrite it later;
        # setting it here keeps the dataset usable on its own.
        self.bs = bs

    def __getitem__(self, idx):
        "Return `bs` consecutive rows of `(cats, conts, ys)` starting at `idx[0]`"
        # `idx` is a sequence of indices; only its first entry is used, as the start
        # offset of a contiguous slice (cheaper than fancy indexing per item).
        idx = idx[0]
        return self.cats[idx:idx+self.bs], self.conts[idx:idx+self.bs], self.ys[idx:idx+self.bs]

    def __len__(self): return len(self.cats)
# Wrap the train and valid subsets of `to` as `TabDataset` objects
train_ds = TabDataset(to.train)
valid_ds = TabDataset(to.valid)
class TabDataLoader(DataLoader):
    "A `DataLoader` based on a `TabDataset`"
    def __init__(self, dataset, bs=1, num_workers=0, device='cuda', shuffle=False, **kwargs):
        # `drop_last` follows `shuffle`: only shuffled loaders drop a trailing partial batch
        super().__init__(
            dataset,
            bs=bs,
            num_workers=num_workers,
            shuffle=shuffle,
            device=device,
            drop_last=shuffle,
            **kwargs,
        )
        # The dataset slices `bs` rows per lookup, so it has to know the batch size
        self.dataset.bs = bs

    def create_item(self, s):
        "Pass the sampled index through untouched"
        return s

    def create_batch(self, b):
        "Fetch one batch from the dataset and move each array to `self.device` as a tensor"
        cats, conts, targs = self.dataset[b]
        return tuple(tensor(arr).to(self.device) for arr in (cats, conts, targs))
train_dl = TabDataLoader(train_ds, bs=128, shuffle=False)
# The validation loader must wrap `valid_ds`. It previously wrapped `train_ds`,
# so every "valid" timing was really a second measurement of the training data.
valid_dl = TabDataLoader(valid_ds, bs=128, shuffle=False)
dls = DataLoaders(train_dl, valid_dl, device='cpu')
# Timing report for the `TabDataLoader`-based `dls` on CPU
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Collect the buffered report, restore the real stdout, then show the report
out = new_stdout.getvalue()
sys.stdout = old_stdout
print(out)
Type = NumPy Device = cpu Batch Size = 128 First Batch: `train`: Best: 0.89ms, Worst: 1.34ms `valid`: Best: 0.9ms, Worst: 1.03ms All Batches: `train`: Best: 31.86ms, Worst: 34.0ms `valid`: Best: 32.57ms, Worst: 40.9ms Average Per Batch: `train`: Best: 0.16ms/batch, Worst: 0.17ms/batch `valid`: Best: 0.16ms/batch, Worst: 0.2ms/batch
# Repeat the `TabDataLoader` timing report with the device set to 'cuda'
dls.device = 'cuda'
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Collect the buffered report, restore the real stdout, then show the report
out = new_stdout.getvalue()
sys.stdout = old_stdout
print(out)
Type = NumPy Device = cuda Batch Size = 128 First Batch: `train`: Best: 1.03ms, Worst: 1.7ms `valid`: Best: 1.03ms, Worst: 1.13ms All Batches: `train`: Best: 52.53ms, Worst: 54.39ms `valid`: Best: 53.38ms, Worst: 60.92ms Average Per Batch: `train`: Best: 0.26ms/batch, Worst: 0.27ms/batch `valid`: Best: 0.26ms/batch, Worst: 0.3ms/batch
shuffle_fn
¶# Don't run
# Reference versions only (marked "Don't run"); presumably the stock implementations being replaced.
# Returns a reordered sample of all of `idxs`, drawn with the loader's own `rng`.
def shuffle_fn(self, idxs): return self.rng.sample(idxs, len(idxs))
# Replaces `rng` with a fresh `random.Random` seeded from the current `rng`.
def randomize(self): self.rng = random.Random(self.rng.randint(0,2**32-1))
@patch
def shuffle_fn(x:TabDataLoader):
    "Reorder the wrapped dataset's arrays in place with one shared random permutation"
    perm = np.random.permutation(len(x.dataset))
    ds = x.dataset
    # The same permutation is applied to all three arrays so rows stay aligned
    for attr in ('cats', 'conts', 'ys'):
        setattr(ds, attr, getattr(ds, attr)[perm])
get_idxs
¶# Don't run
# Reference version only (marked "Don't run"); presumably the stock implementation being replaced.
def get_idxs(self):
    # Start from `Inf.count` when `indexed`, otherwise `Inf.nones` (presumably endless iterators)
    idxs = Inf.count if self.indexed else Inf.nones
    # With a known length `n`, materialise the first `n` entries as a list
    if self.n is not None: idxs = list(itertools.islice(idxs, self.n))
    # Shuffling is delegated to `shuffle_fn`, whose return value replaces `idxs`
    if self.shuffle: idxs = self.shuffle_fn(idxs)
    return idxs
@patch
def get_idxs(x:TabDataLoader):
    "Return the indices to iterate over, shuffling the dataset itself when `shuffle` is set"
    if x.indexed:
        indices = Inf.count
    else:
        indices = Inf.nones
    # A known length `n` switches to one explicit index per dataset row
    if x.n is not None:
        indices = list(range(len(x.dataset)))
    # The indices stay in order; the patched `shuffle_fn` permutes the dataset arrays instead
    if x.shuffle:
        x.shuffle_fn()
    return indices
# Rebuild the loaders: shuffled training loader, and a validation loader over `valid_ds`
train_dl = TabDataLoader(train_ds, shuffle=True, bs=128)
valid_dl = TabDataLoader(valid_ds, bs=128)
dls = DataLoaders(train_dl, valid_dl, device='cpu')
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Keep the report in `out` and restore the real stdout; the report is not printed here
out = new_stdout.getvalue()
sys.stdout = old_stdout
# fastai comparison run on CPU; its report is stored in `out2`
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
dls_f = to.dataloaders(bs=128, device='cpu')
print(f'Type = fastai\nDevice = {dls_f.device}\nBatch Size = {dls_f.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls_f.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls_f.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls_f.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls_f.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Store this report in `out2`, restore the real stdout, then print the earlier `out` report
out2 = new_stdout.getvalue()
sys.stdout = old_stdout
print(out)
Type = NumPy Device = cpu Batch Size = 128 First Batch: `train`: Best: 2.4ms, Worst: 4.49ms `valid`: Best: 0.3ms, Worst: 0.44ms All Batches: `train`: Best: 33.32ms, Worst: 39.55ms `valid`: Best: 7.84ms, Worst: 8.0ms Average Per Batch: `train`: Best: 0.16ms/batch, Worst: 0.19ms/batch `valid`: Best: 0.15ms/batch, Worst: 0.16ms/batch
print(out2)
Type = fastai Device = cpu Batch Size = 128 First Batch: `train`: Best: 18.73ms, Worst: 22.48ms `valid`: Best: 3.51ms, Worst: 3.67ms All Batches: `train`: Best: 683.69ms, Worst: 693.98ms `valid`: Best: 163.9ms, Worst: 178.87ms Average Per Batch: `train`: Best: 3.37ms/batch, Worst: 3.42ms/batch `valid`: Best: 3.21ms/batch, Worst: 3.51ms/batch
# Repeat the `TabDataLoader` timing report with the device set to 'cuda'
dls.device = 'cuda'
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Keep the report in `out` and restore the real stdout; the report is not printed here
out = new_stdout.getvalue()
sys.stdout = old_stdout
# fastai comparison run with the device set to 'cuda'; its report is stored in `out2`
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
dls_f.device = 'cuda'
print(f'Type = fastai\nDevice = {dls_f.device}\nBatch Size = {dls_f.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls_f.train)) # Time getting first batch
best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls_f.valid))
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.train: pass # Time going over all batches
best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
# Per-batch figure: the full-pass timings divided by the number of batches
b_t,w_t = get_avg(best, worst, dls_f.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.valid: pass
best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls_f.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
# Store this report in `out2`, restore the real stdout, then print the earlier `out` report
out2 = new_stdout.getvalue()
sys.stdout = old_stdout
print(out)
Type = NumPy Device = cuda Batch Size = 128 First Batch: `train`: Best: 2.5ms, Worst: 9.24ms `valid`: Best: 0.44ms, Worst: 0.62ms All Batches: `train`: Best: 48.83ms, Worst: 49.51ms `valid`: Best: 13.04ms, Worst: 13.57ms Average Per Batch: `train`: Best: 0.24ms/batch, Worst: 0.24ms/batch `valid`: Best: 0.26ms/batch, Worst: 0.27ms/batch
print(out2)
Type = fastai Device = cuda Batch Size = 128 First Batch: `train`: Best: 19.31ms, Worst: 22.86ms `valid`: Best: 3.57ms, Worst: 3.75ms All Batches: `train`: Best: 713.25ms, Worst: 745.08ms `valid`: Best: 170.11ms, Worst: 179.02ms Average Per Batch: `train`: Best: 3.51ms/batch, Worst: 3.67ms/batch `valid`: Best: 3.34ms/batch, Worst: 3.51ms/batch