#!/usr/bin/env python
# coding: utf-8

# # Getting data/Kaggle
# In order to enter a Kaggle competition, or even just work with the images, we must first get the data.

# ## Getting data
#
# There are a couple of ways to get the data.
#
# ### Option 1: Kaggle-cli
# To download the official [Kaggle](https://www.kaggle.com) datasets, the easiest way is to use the [Kaggle cli](https://github.com/floydwch/kaggle-cli).
#
# This is a screen scraper, so it often needs to be updated when the website changes.
#
# __To install/upgrade:__
# `pip install kaggle-cli --upgrade`
#
# __To get data:__
# `kg download -u [username] -p [password] -c [competition-name]`
#
# The competition name is the part of the URL after `/c/`.
#
# E.g., in https://www.kaggle.com/c/planet-understanding-the-amazon-from-space, the competition name is _planet-understanding-the-amazon-from-space_.
#
# __Note: You must go to the competition on Kaggle and click download first, in order to accept the competition terms and conditions.__
#
# ### Option 2: Chrome wget extension
# Another option is to use [CurlWget](https://chrome.google.com/webstore/detail/curlwget/jmocjfidanebdlinpbcdkcmgdifblncg?hl=en), a Chrome extension.
#
# With this installed, simply click the download link for what you want, and it will generate a `wget` command that you can paste into your Ubuntu console to download the data.
#
# This approach has the advantage that you can download just specific things. For example, instead of downloading all of the contest data, you can download only the folder containing the JPG images.
#
# ### Note on symbolic links
# In Ubuntu, you can create symbolic links (like shortcuts). For example, you can make the `data` folder in your Jupyter notebook directory point to a top-level directory called `data`.
#
# To view symbolic links, use the `ls -l [directory]` command, e.g. `ls -l courses/dl1`. Symbolic links are identified by `->` in the output.
#
# ## Dog Breeds
# We will enter the [dog breed identification contest](https://www.kaggle.com/c/dog-breed-identification/).
#
# First, get the data using the steps above.

# In[1]:

# Do our usual import and setup
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# Set up our imports
from fastai.imports import *
from fastai.torch_imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

# In[3]:

# Set up our paths, image size, architecture, batch size, etc.
PATH = "data/dogbreed/"
size = 224
architecture = resnext101_64
batch_size = 58

# After doing some initial set up, we do some data exploration to see what the downloaded data looks like.
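# Before exploring, a quick, optional sanity check (not part of the original walkthrough) that the download landed under PATH as expected. It only assumes the standard competition files (`labels.csv`, plus the `train` and `test` folders) are present.

# In[ ]:

# Hedged sanity check: confirm the competition files are under PATH before continuing
import os
assert os.path.exists(f'{PATH}labels.csv'), 'labels.csv not found -- check the download/symlink'
print(sorted(os.listdir(PATH)))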
# In[4]:

label_csv = f'{PATH}labels.csv'

# Number of rows minus 1 (to account for the header row)
n = len(list(open(label_csv))) - 1

# Take a random 20% of the rows to use as the validation set
# (get_cv_idxs returns the cross-validation indexes)
val_idxs = get_cv_idxs(n)

# In[5]:

val_idxs

# In[6]:

# Take a look and see the files we have
get_ipython().system('ls {PATH}')

# In[7]:

# Take a look inside labels.csv using a pandas dataframe
label_df = pd.read_csv(label_csv)
label_df.head()

# In[8]:

label_df.count()

# In[9]:

# We then count the number of images for each breed
label_df.pivot_table(index='breed', aggfunc=len).sort_values('id', ascending=False)

# In[10]:

# We're going to use data augmentation, so let's set up our transforms
transforms = tfms_from_model(architecture, size, aug_tfms=transforms_side_on, max_zoom=1.1)

# This time we're reading from a CSV instead of from a folder structure.
# We also use the validation indexes we generated above.
# Because the file names in the id column of the CSV don't have an extension,
# we pass in '.jpg' as a suffix as well.
data = ImageClassifierData.from_csv(PATH, 'train', f'{PATH}labels.csv', test_name='test',
                                    val_idxs=val_idxs, suffix='.jpg', tfms=transforms, bs=batch_size)

# In[11]:

# Let's check out a file
file_name = PATH + data.trn_ds.fnames[0]
file_name

# In[12]:

img = PIL.Image.open(file_name)
img

# In[13]:

img.size

# In[14]:

size_d = {k: PIL.Image.open(PATH + k).size for k in data.trn_ds.fnames}

# In[15]:

# Take a look at all the file sizes
size_d

# In[16]:

# Get the row sizes and column sizes from the dictionary above
row_size, column_size = list(zip(*size_d.values()))

# In[17]:

row_size = np.array(row_size)
column_size = np.array(column_size)

# In[18]:

row_size[:5]

# In[19]:

column_size[:5]

# In[20]:

# See how big most of our images are
# Most are around 500 pixels wide
plt.hist(row_size)

# In[21]:

# We can exclude some of the outliers to get a more detailed view
plt.hist(row_size[row_size < 1000])

# We could do the same for the columns. This is just to make sure we don't have any extremely wide or large images, etc.

# In[22]:

# Get the size of the training and test data sets
len(data.trn_ds), len(data.test_ds)

# In[23]:

# Get the number of classes
len(data.classes), data.classes[:5]

# ## Setting up the model
# Now that we've done some data exploration, we can set up the model.

# In[24]:

# Convenience method
def get_data(size, batch_size):
    transforms = tfms_from_model(architecture, size, aug_tfms=transforms_side_on, max_zoom=1.1)
    data = ImageClassifierData.from_csv(PATH, 'train', f'{PATH}labels.csv', test_name='test',
                                        num_workers=4, val_idxs=val_idxs, suffix='.jpg',
                                        tfms=transforms, bs=batch_size)
    # For smaller image sizes, resize the images on disk (into 'tmp') to speed up loading
    return data if size > 300 else data.resize(340, 'tmp')

# ### Precompute
# Set precompute=True and do our initial training of the final layers.

# In[25]:

data = get_data(size, batch_size)

# In[26]:

learn = ConvLearner.pretrained(architecture, data, precompute=True)

# In[27]:

learn.fit(1e-2, 5)

# ### Augment
# Next we turn precompute off so we can use data augmentations (transformations).

# In[28]:

from sklearn import metrics

# In[29]:

data = get_data(size, batch_size)

# In[30]:

learn = ConvLearner.pretrained(architecture, data, precompute=True, ps=0.5)

# In[31]:

learn.fit(1e-2, 2)

# In[32]:

learn.precompute = False

# In[33]:

# We used the learning rate finder to check that the learning rate was sensible.
# We tried 5 epochs to check that the accuracy kept improving, then stuck with that.
learn.fit(1e-2, 5, cycle_len=1)

# In[34]:

# Save our weights
learn.save('224_pre')

# In[35]:

learn.load('224_pre')
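# Before moving on, it can be worth confirming what the SGDR schedule did during the last fit. This is an optional check, not part of the original notebook; it assumes `learn.sched` still holds the history from the preceding `learn.fit` call.

# In[ ]:

# Optional check: plot the learning rate over iterations from the last fit.
# With cycle_len=1 this should show a cosine-annealed restart once per epoch.
learn.sched.plot_lr()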
# ### Increase size
# Then we increase the size of the images.
#
# A tip from Jeremy: train on smaller images first, then switch to larger images, so the network sees different inputs. This helps prevent overfitting. Also, training on the small images is faster, so do it first.

# In[36]:

learn.set_data(get_data(299, batch_size))
learn.freeze()

# In[37]:

# If the training loss is greater than the validation loss, we're underfitting.
# We can increase the cycle_mult parameter to correct this.
# (We'd normally do 3 epochs.)
learn.fit(1e-2, 1, cycle_len=1, cycle_mult=2)

# In[39]:

# Then try using test time augmentation (TTA) to see if that gives better results
log_preds, y = learn.TTA()

# In[40]:

probs = np.mean(np.exp(log_preds), 0)

# In[42]:

accuracy_np(probs, y)

# We didn't unfreeze and train any further. This data comes from ImageNet, so fine-tuning layers that were already pretrained on ImageNet didn't make any difference.

# ## Create the submission
# Now that we've trained our model, we want to submit our predictions to Kaggle. In the evaluation section of the competition, you can see the requested format:
#
# `id,affenpinscher,afghan_hound,..,yorkshire_terrier`
# `000621fb3cbb32d8935728e48679680e,0.0083,0.0,...,0.0083`
# etc.

# In[43]:

submission_csv = f'{PATH}sample_submission.csv'
submission_df = pd.read_csv(submission_csv)
submission_df.head()

# In[44]:

# Take a look at the different classes we have.
# They can be accessed from our data object.
data.classes

# In[45]:

# We can also see all of the test file names
data.test_ds.fnames

# In[47]:

# Get predictions on the test set
# (We can't compute accuracy since, by definition, we don't have the test labels)
log_preds, y = learn.TTA(is_test=True)
probs = np.mean(np.exp(log_preds), 0)

# In[48]:

# 10357 images, 120 possible breeds.
# Think of it like a matrix -- one row per image, with a probability for each breed.
probs.shape

# In[49]:

df = pd.DataFrame(probs)
df.columns = data.classes

# In[50]:

# Insert the IDs, stripping the 'test/' prefix and the '.jpg' extension ([5:-4])
df.insert(0, 'id', [o[5:-4] for o in data.test_ds.fnames])

# In[51]:

# Now we have the data in our submission format
df.head()

# In[52]:

# Then we just save it to a file
submission = f'{PATH}subm/'
os.makedirs(submission, exist_ok=True)
df.to_csv(f'{submission}submission.gz', compression='gzip', index=False)

# In[53]:

# Get a link to the file to download
FileLink(f'{submission}submission.gz')

# # Predicting individual images
# Sometimes you might want to predict just one image.

# In[54]:

file_name = data.val_ds.fnames[0]

# In[55]:

file_name

# In[56]:

PIL.Image.open(PATH + file_name).resize((150, 150))

# In[57]:

trn_tfms, val_tfms = tfms_from_model(architecture, size)

# In[61]:

# predict_array expects a mini-batch (a collection of images), so by indexing
# with im[None] we create a mini-batch containing just this one image.
# open_image (from fastai.dataset) returns a normalized numpy array,
# which is what the transforms expect.
im = val_tfms(open_image(PATH + file_name))
preds = learn.predict_array(im[None])
np.argmax(preds)

# In[ ]:
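# To turn that raw prediction into something readable, you can map the index back to a breed name via `data.classes`. A minimal sketch (not from the original notebook), assuming `preds` holds log-probabilities of shape (1, num_classes), as the use of `np.exp` on the TTA output above suggests:

# In[ ]:

# Map the predicted index to a breed name and its (approximate) probability
pred_idx = int(np.argmax(preds))
print(data.classes[pred_idx], float(np.exp(preds[0][pred_idx])))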