#hide
#skip
! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on colab
#default_exp data.external
#export
from fastai.torch_basics import *
from fastdownload import FastDownload
from functools import lru_cache
import fastai.data
Helper functions to download the fastai datasets
To download any of the datasets or pretrained weights, simply run untar_data
by passing any dataset name mentioned above like so:
path = untar_data(URLs.PETS)
path.ls()
>> (#7393) [Path('/home/ubuntu/.fastai/data/oxford-iiit-pet/images/keeshond_34.jpg'),...]
To download model pretrained weights:
path = untar_data(URLs.WT103_BWD)
path.ls()
>> (#2) [Path('/home/ubuntu/.fastai/data/wt103-bwd/itos_wt103.pkl'),Path('/home/ubuntu/.fastai/data/wt103-bwd/lstm_bwd.pth')]
A complete list of datasets that are available by default inside the library are:
HUMAN_NUMBERS: A synthetic dataset consisting of human number counts in text such as one, two, three, four.. Useful for experimenting with Language Models.
IMDB: The full IMDB sentiment analysis dataset.
IMDB_SAMPLE: A sample of the full IMDB sentiment analysis dataset.
ML_SAMPLE: A movielens sample dataset for recommendation engines to recommend movies to users.
ML_100k: The movielens 100k dataset for recommendation engines to recommend movies to users.
MNIST_SAMPLE: A sample of the famous MNIST dataset consisting of handwritten digits.
MNIST_TINY: A tiny version of the famous MNIST dataset consisting of handwritten digits.
MNIST_VAR_SIZE_TINY: A tiny version of the MNIST dataset consisting of handwritten digits, with images of varying sizes.
PLANET_SAMPLE: A sample of the planets dataset from the Kaggle competition Planet: Understanding the Amazon from Space.
PLANET_TINY: A tiny version of the planets dataset from the Kaggle competition Planet: Understanding the Amazon from Space for faster experimentation and prototyping.
IMAGENETTE: A smaller version of the imagenet dataset pronounced just like 'Imagenet', except with a corny inauthentic French accent.
IMAGENETTE_160: The 160px version of the Imagenette dataset.
IMAGENETTE_320: The 320px version of the Imagenette dataset.
IMAGEWOOF: Imagewoof is a subset of 10 classes from Imagenet that aren't so easy to classify, since they're all dog breeds.
IMAGEWOOF_160: 160px version of the ImageWoof dataset.
IMAGEWOOF_320: 320px version of the ImageWoof dataset.
IMAGEWANG: Imagewang contains Imagenette and Imagewoof combined, but with some twists that make it into a tricky semi-supervised unbalanced classification problem
IMAGEWANG_160: 160px version of Imagewang.
IMAGEWANG_320: 320px version of Imagewang.
SIIM_SMALL: A smaller version of the SIIM dataset where the objective is to classify pneumothorax from a set of chest radiographic images.
TCGA_SMALL: A smaller version of the TCGA-OV dataset with subcutaneous and visceral fat segmentations. Citations:
Holback, C., Jarosz, R., Prior, F., Mutch, D. G., Bhosale, P., Garcia, K., … Erickson, B. J. (2016). Radiology Data from The Cancer Genome Atlas Ovarian Cancer [TCGA-OV] collection. The Cancer Imaging Archive. http://doi.org/10.7937/K9/TCIA.2016.NDO1MDFQ
Clark K, Vendt B, Smith K, Freymann J, Kirby J, Koppel P, Moore S, Phillips S, Maffitt D, Pringle M, Tarbox L, Prior F. The Cancer Imaging Archive (TCIA): Maintaining and Operating a Public Information Repository, Journal of Digital Imaging, Volume 26, Number 6, December, 2013, pp 1045-1057. https://link.springer.com/article/10.1007/s10278-013-9622-7
# export
@lru_cache(maxsize=None)
def fastai_cfg():
    "`Config` object for fastai's `config.ini`"
    # Default folder names written into config.ini the first time it is created.
    defaults = {'data': 'data', 'archive': 'archive', 'storage': 'tmp', 'model': 'models'}
    # FASTAI_HOME overrides the default ~/.fastai location; cached so the
    # config file is parsed at most once per process.
    cfg_home = Path(os.getenv('FASTAI_HOME', '~/.fastai'))
    return Config(cfg_home, 'config.ini', create=defaults)
This is a basic Config
file that consists of data
, model
, storage
and archive
.
All future downloads occur at the paths defined in the config file based on the type of download. For example, all future fastai datasets are downloaded to the data
while all pretrained model weights are downloaded to model
unless the default download location is updated.
cfg = fastai_cfg()
cfg.data,cfg.path('data')
('data', Path('/home/jhoward/.fastai/data'))
# export
def fastai_path(folder):
    "Path to `folder` in `fastai_cfg`"
    cfg = fastai_cfg()
    return cfg.path(folder)
fastai_path('archive')
Path('/home/jhoward/.fastai/archive')
#export
class URLs():
    "Global constants for dataset and model URLs."
    # Fallback root for downloads relative to the current working directory;
    # see `path` below, which prefers a cwd-local copy if one exists.
    LOCAL_PATH = Path.cwd()
    MDL = 'http://files.fast.ai/models/'
    GOOGLE = 'https://storage.googleapis.com/'
    # Base prefix for the fast.ai S3 buckets; the per-category prefixes below
    # append the bucket suffix (e.g. 'fast-ai-imageclas').
    S3 = 'https://s3.amazonaws.com/fast-ai-'

    URL = f'{S3}sample/'
    S3_IMAGE = f'{S3}imageclas/'
    S3_IMAGELOC = f'{S3}imagelocal/'
    S3_AUDI = f'{S3}audio/'
    S3_NLP = f'{S3}nlp/'
    S3_COCO = f'{S3}coco/'
    S3_MODEL = f'{S3}modelzoo/'

    # main datasets
    ADULT_SAMPLE        = f'{URL}adult_sample.tgz'
    BIWI_SAMPLE         = f'{URL}biwi_sample.tgz'
    CIFAR               = f'{URL}cifar10.tgz'
    COCO_SAMPLE         = f'{S3_COCO}coco_sample.tgz'
    COCO_TINY           = f'{S3_COCO}coco_tiny.tgz'
    HUMAN_NUMBERS       = f'{URL}human_numbers.tgz'
    IMDB                = f'{S3_NLP}imdb.tgz'
    IMDB_SAMPLE         = f'{URL}imdb_sample.tgz'
    ML_SAMPLE           = f'{URL}movie_lens_sample.tgz'
    ML_100k             = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
    MNIST_SAMPLE        = f'{URL}mnist_sample.tgz'
    MNIST_TINY          = f'{URL}mnist_tiny.tgz'
    MNIST_VAR_SIZE_TINY = f'{S3_IMAGE}mnist_var_size_tiny.tgz'
    PLANET_SAMPLE       = f'{URL}planet_sample.tgz'
    PLANET_TINY         = f'{URL}planet_tiny.tgz'
    IMAGENETTE          = f'{S3_IMAGE}imagenette2.tgz'
    IMAGENETTE_160      = f'{S3_IMAGE}imagenette2-160.tgz'
    IMAGENETTE_320      = f'{S3_IMAGE}imagenette2-320.tgz'
    IMAGEWOOF           = f'{S3_IMAGE}imagewoof2.tgz'
    IMAGEWOOF_160       = f'{S3_IMAGE}imagewoof2-160.tgz'
    IMAGEWOOF_320       = f'{S3_IMAGE}imagewoof2-320.tgz'
    IMAGEWANG           = f'{S3_IMAGE}imagewang.tgz'
    IMAGEWANG_160       = f'{S3_IMAGE}imagewang-160.tgz'
    IMAGEWANG_320       = f'{S3_IMAGE}imagewang-320.tgz'

    # kaggle competitions download dogs-vs-cats -p {DOGS.absolute()}
    DOGS = f'{URL}dogscats.tgz'

    # image classification datasets
    CALTECH_101  = f'{S3_IMAGE}caltech_101.tgz'
    CARS         = f'{S3_IMAGE}stanford-cars.tgz'
    CIFAR_100    = f'{S3_IMAGE}cifar100.tgz'
    CUB_200_2011 = f'{S3_IMAGE}CUB_200_2011.tgz'
    FLOWERS      = f'{S3_IMAGE}oxford-102-flowers.tgz'
    FOOD         = f'{S3_IMAGE}food-101.tgz'
    MNIST        = f'{S3_IMAGE}mnist_png.tgz'
    PETS         = f'{S3_IMAGE}oxford-iiit-pet.tgz'

    # NLP datasets
    AG_NEWS                 = f'{S3_NLP}ag_news_csv.tgz'
    AMAZON_REVIEWS          = f'{S3_NLP}amazon_review_full_csv.tgz'
    AMAZON_REVIEWS_POLARITY = f'{S3_NLP}amazon_review_polarity_csv.tgz'
    DBPEDIA                 = f'{S3_NLP}dbpedia_csv.tgz'
    MT_ENG_FRA              = f'{S3_NLP}giga-fren.tgz'
    SOGOU_NEWS              = f'{S3_NLP}sogou_news_csv.tgz'
    WIKITEXT                = f'{S3_NLP}wikitext-103.tgz'
    WIKITEXT_TINY           = f'{S3_NLP}wikitext-2.tgz'
    YAHOO_ANSWERS           = f'{S3_NLP}yahoo_answers_csv.tgz'
    YELP_REVIEWS            = f'{S3_NLP}yelp_review_full_csv.tgz'
    YELP_REVIEWS_POLARITY   = f'{S3_NLP}yelp_review_polarity_csv.tgz'

    # Image localization datasets
    BIWI_HEAD_POSE = f"{S3_IMAGELOC}biwi_head_pose.tgz"
    CAMVID         = f'{S3_IMAGELOC}camvid.tgz'
    CAMVID_TINY    = f'{URL}camvid_tiny.tgz'
    LSUN_BEDROOMS  = f'{S3_IMAGE}bedroom.tgz'
    PASCAL_2007    = f'{S3_IMAGELOC}pascal_2007.tgz'
    PASCAL_2012    = f'{S3_IMAGELOC}pascal_2012.tgz'

    # Audio classification datasets
    MACAQUES    = f'{GOOGLE}ml-animal-sounds-datasets/macaques.zip'
    ZEBRA_FINCH = f'{GOOGLE}ml-animal-sounds-datasets/zebra_finch.zip'

    # Medical Imaging datasets
    #SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz'
    SIIM_SMALL = f'{S3_IMAGELOC}siim_small.tgz'
    TCGA_SMALL = f'{S3_IMAGELOC}tcga_small.tgz'

    #Pretrained models
    OPENAI_TRANSFORMER = f'{S3_MODEL}transformer.tgz'
    WT103_FWD          = f'{S3_MODEL}wt103-fwd.tgz'
    WT103_BWD          = f'{S3_MODEL}wt103-bwd.tgz'

    # NOTE: plain function in the class body (no `self`/`@staticmethod`) —
    # intended to be called on the class itself, e.g. `URLs.path(url)`.
    def path(url='.', c_key='archive'):
        "Return local path where to download based on `c_key`"
        fname = url.split('/')[-1]
        # Prefer a copy next to the current working directory, if one already
        # exists, over the config-file location returned by `fastai_path`.
        local_path = URLs.LOCAL_PATH/('models' if c_key=='model' else 'data')/fname
        if local_path.exists(): return local_path
        return fastai_path(c_key)/fname
The default local path is at ~/.fastai/archive/
but this can be updated by passing a different c_key
. Note: c_key
should be one of 'archive', 'data', 'model', 'storage'
.
url = URLs.PETS
local_path = URLs.path(url)
test_eq(local_path.parent, fastai_path('archive'))
local_path
Path('/home/jhoward/.fastai/archive/oxford-iiit-pet.tgz')
local_path = URLs.path(url, c_key='model')
test_eq(local_path.parent, fastai_path('model'))
local_path
Path('/home/jhoward/.fastai/models/oxford-iiit-pet.tgz')
#export
def untar_data(url, archive=None, data=None, c_key='data', force_download=False):
    "Download `url` using `FastDownload.get`, extract it, and return the `Path` of the extracted folder"
    # Thin wrapper around `FastDownload.get`:
    # - `archive`/`data` override the download and extraction directories
    #   (otherwise taken from `fastai_cfg()`, rooted at `base`)
    # - `c_key` selects which config path to extract into ('data' or 'model')
    # - `force_download=True` re-downloads and overwrites any existing copy
    d = FastDownload(fastai_cfg(), module=fastai.data, archive=archive, data=data, base='~/.fastai')
    return d.get(url, force=force_download, extract_key=c_key)
untar_data
is a thin wrapper for FastDownload.get
. It downloads and extracts url
, by default to subdirectories of ~/.fastai
, and returns the path to the extracted data. Setting the force_download
flag to True will overwrite any existing copy of the data already present. For an explanation of the c_key
parameter, see URLs
.
untar_data(URLs.MNIST_SAMPLE)
Path('/home/jhoward/.fastai/data/mnist_sample')
#hide
#Check all URLs are in the download_checks.py file and match for downloaded archives
# from fastdownload import read_checks
# fd = FastDownload(fastai_cfg(), module=fastai.data)
# _whitelist = "MDL LOCAL_PATH URL WT103_BWD WT103_FWD GOOGLE".split()
# checks = read_checks(fd.module)
# for d in dir(URLs):
# if d.upper() == d and not d.startswith("S3") and not d in _whitelist:
# url = getattr(URLs, d)
# assert url in checks,f"""{d} is not in the check file for all URLs.
# To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):
# url = URLs.{d}
# fd.get(url, force=True)
# fd.update(url)
# """
# f = fd.download(url)
# assert fd.check(url, f),f"""The log we have for {d} in checks does not match the actual archive.
# To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):
# url = URLs.{d}
# _add_check(url, URLs.path(url))
# """
#hide
# Export every cell tagged `#export`/`# export` into the library's .py modules.
from nbdev.export import notebook2script
notebook2script()
Converted 00_torch_core.ipynb. Converted 01_layers.ipynb. Converted 01a_losses.ipynb. Converted 02_data.load.ipynb. Converted 03_data.core.ipynb. Converted 04_data.external.ipynb. Converted 05_data.transforms.ipynb. Converted 06_data.block.ipynb. Converted 07_vision.core.ipynb. Converted 08_vision.data.ipynb. Converted 09_vision.augment.ipynb. Converted 09b_vision.utils.ipynb. Converted 09c_vision.widgets.ipynb. Converted 10_tutorial.pets.ipynb. Converted 10b_tutorial.albumentations.ipynb. Converted 11_vision.models.xresnet.ipynb. Converted 12_optimizer.ipynb. Converted 13_callback.core.ipynb. Converted 13a_learner.ipynb. Converted 13b_metrics.ipynb. Converted 14_callback.schedule.ipynb. Converted 14a_callback.data.ipynb. Converted 15_callback.hook.ipynb. Converted 15a_vision.models.unet.ipynb. Converted 16_callback.progress.ipynb. Converted 17_callback.tracker.ipynb. Converted 18_callback.fp16.ipynb. Converted 18a_callback.training.ipynb. Converted 18b_callback.preds.ipynb. Converted 19_callback.mixup.ipynb. Converted 20_interpret.ipynb. Converted 20a_distributed.ipynb. Converted 21_vision.learner.ipynb. Converted 22_tutorial.imagenette.ipynb. Converted 23_tutorial.vision.ipynb. Converted 24_tutorial.image_sequence.ipynb. Converted 24_tutorial.siamese.ipynb. Converted 24_vision.gan.ipynb. Converted 30_text.core.ipynb. Converted 31_text.data.ipynb. Converted 32_text.models.awdlstm.ipynb. Converted 33_text.models.core.ipynb. Converted 34_callback.rnn.ipynb. Converted 35_tutorial.wikitext.ipynb. Converted 37_text.learner.ipynb. Converted 38_tutorial.text.ipynb. Converted 39_tutorial.transformers.ipynb. Converted 40_tabular.core.ipynb. Converted 41_tabular.data.ipynb. Converted 42_tabular.model.ipynb. Converted 43_tabular.learner.ipynb. Converted 44_tutorial.tabular.ipynb. Converted 45_collab.ipynb. Converted 46_tutorial.collab.ipynb. Converted 50_tutorial.datablock.ipynb. Converted 60_medical.imaging.ipynb. Converted 61_tutorial.medical_imaging.ipynb. 
Converted 65_medical.text.ipynb. Converted 70_callback.wandb.ipynb. Converted 71_callback.tensorboard.ipynb. Converted 72_callback.neptune.ipynb. Converted 73_callback.captum.ipynb. Converted 74_callback.azureml.ipynb. Converted 97_test_utils.ipynb. Converted 99_pytorch_doc.ipynb. Converted dev-setup.ipynb. Converted app_examples.ipynb. Converted camvid.ipynb. Converted migrating_catalyst.ipynb. Converted migrating_ignite.ipynb. Converted migrating_lightning.ipynb. Converted migrating_pytorch.ipynb. Converted migrating_pytorch_verbose.ipynb. Converted ulmfit.ipynb. Converted index.ipynb. Converted quick_start.ipynb. Converted tutorial.ipynb.