In [ ]:
# # For cloud-vm Jupyter lab where I don't have easy control over width yet
# # jupyter full-width cells https://github.com/jupyter/notebook/issues/1909#issuecomment-266116532
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

Download and clean data for the fighterjet dataset.


2018-12-03 17:14:06

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
In [2]:
from fastai import *
from fastai.vision import *
from fastai.widgets import *; __version__
Out[2]:
'1.0.32'
In [3]:
path = Path('data/aircraft')
In [ ]:
# ! mv fighterjet-failed-links.txt {path}/
# ! mv fighterjet-urls/ {path}/
In [ ]:
# path = Config.data_path()/'aircraft'; path.mkdir(parents=True, exist_ok=True) # set & create data directory
# ! cp -r fighterjet-urls {path}/ # copy urls to data directory
In [4]:
urls = path/'fighterjet-urls'
In [5]:
urls.ls()
Out[5]:
[PosixPath('data/aircraft/fighterjet-urls/tornado.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f35.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su57.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f22.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f4.txt'),
 PosixPath('data/aircraft/fighterjet-urls/mig29.txt'),
 PosixPath('data/aircraft/fighterjet-urls/typhoon.txt'),
 PosixPath('data/aircraft/fighterjet-urls/jas39.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su34.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su25.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su30.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su24.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su27.txt'),
 PosixPath('data/aircraft/fighterjet-urls/su17.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f18e.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f15c.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f18c.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f15e.txt'),
 PosixPath('data/aircraft/fighterjet-urls/mig25.txt'),
 PosixPath('data/aircraft/fighterjet-urls/mig31.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f14.txt'),
 PosixPath('data/aircraft/fighterjet-urls/f16.txt'),
 PosixPath('data/aircraft/fighterjet-urls/mig27.txt'),
 PosixPath('data/aircraft/fighterjet-urls/mig23.txt'),
 PosixPath('data/aircraft/fighterjet-urls/rafale.txt'),
 PosixPath('data/aircraft/fighterjet-urls/j20.txt'),
 PosixPath('data/aircraft/fighterjet-urls/mig21.txt')]

1. Download dataset

In [32]:
# # download dataset
# for url_path in urls.ls():
#     aircraft_type = url_path.name.split('.')[0] # get class name
#     print(f'downloading: {aircraft_type}')
#     dest = path/aircraft_type; dest.mkdir(parents=True, exist_ok=True) # set & create class folder
#     download_images(url_path, dest)

2. Download and preserve url filenames

Preserving the url filenames makes it a lot easier to remove links for images you don't want in the dataset.

If you're saving the filename from the url, you also need to decode the percent-encoded UTF-8 bytes back to text. See: https://stackoverflow.com/a/16566128

Unfortunately, I noticed something else. The filenames come from entire urls, and it's not at all uncommon for different links to end in the same filename, in which case the image just gets overwritten. Even if that weren't the case, it feels like a better-engineered solution would be to keep a dictionary mapping file integer number to url.
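
To make the collision concrete (the urls and names below are made up):

In [ ]:
# two made-up urls with the same trailing filename -- the second download would overwrite the first
urls_demo = ['https://site-one.example.com/images/f22.jpg',
             'https://site-two.example.com/gallery/f22.jpg']
print([u.split('/')[-1] for u in urls_demo])   # ['f22.jpg', 'f22.jpg']

# the alternative: keep the sequential integer filenames and record {int_filename: url}
mapping_demo = {f'{i:08d}.jpg': u for i, u in enumerate(urls_demo)}
print(mapping_demo)  # {'00000000.jpg': 'https://site-one...', '00000001.jpg': 'https://site-two...'}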

I don't really know how to do that in a callback yet. What I can do instead is have a dictionary as a global variable and write to it.

I also edited download_image to retry an image url up to 5 times before moving on. This is to catch links that work, just not reliably on the first attempt.

Now when this is done, I can copy the actual broken links to the failed links file and clear them from the url lists as before; then go into macOS's Finder and manually remove images that don't fit.

Then I can remove the urls corresponding to filenames that are in the dictionary mapping (ie: they were downloaded) but not in their folders (I removed them).

This doesn't handle misclassified images, but honestly, with hundreds of images per class it doesn't really matter if I just delete them. The work of moving them and then reflecting the move in the url files is a bit too much.

# looks like fastai has a url-to-name function too:
def url2name(url): return url.split('/')[-1]
In [34]:
td = {}
td
Out[34]:
{}
In [36]:
c = 'f22'
if c not in td.keys(): td[c] = {}
td
Out[36]:
{'f22': {}}
In [37]:
td[c]['name'] = 'url'
In [38]:
td
Out[38]:
{'f22': {'name': 'url'}}

The code below doesn't need to look as complicated as it does -- after a lot of iterations I finally found a simple solution that works at full speed: print out the filename and url 😅.

In [7]:
# you could just run `fastai.data.download_image` in a big loop and give it the 
# destination filepath yourself; this way adapts fastai's parallelized method
# to name files by their url filename instead of sequential integers.

# from urllib.parse import unquote # for decoding utf8 bytes

class ImageDownloader(object):
    """A class to download images and hold on to their filename-url mappings."""
    
    def __init__(self):
#         self.url_fname_dict = {}
        self.clas = 'N/A'
#         self.failed_downloads = []
        
    def download_image(self, url, dest, timeout=4):
        # many images work fine but aren't downloading on the 1st try;
        # maybe trying multiple times will work
        # NOTE: saving to the dict will not work if using multiple processes
        for i in range(5):
            try: 
                r = download_url(url, dest, overwrite=True, show_progress=False, timeout=timeout)
#                 self.url_fname_dict[self.clas][dest.name] = url # {filename:url}
                print(f'saved: {dest} - {url}') # a much simpler solution
                break
            except Exception as e: 
                if i == 4:
#                     self.failed_downloads.append(url)
                    print(f"Error {url} {e}")
                else: continue

    def _download_image_inner_2(self, dest, url, i, timeout=4):
    #     url = unquote(url) # decode utf8 bytes
        suffix = re.findall(r'\.\w+?(?=(?:\?|$))', url)
        suffix = suffix[0] if len(suffix)>0  else '.jpg'
    #     fname = url.split('/')[-1].split(suffix)[0]
    #     download_image(url, dest/f"{fname}{suffix}", timeout=timeout)
        self.download_image(url, dest/f"{i:08d}{suffix}", timeout=timeout)

    def download_images_2(self, urls:Collection[str], dest:PathOrStr, max_pics:int=1000, max_workers:int=8, timeout=4):
        "Download images listed in text file `urls` to path `dest`, at most `max_pics`"
#         if self.clas not in self.url_fname_dict.keys(): self.url_fname_dict[self.clas] = {} # this line is apparently overwriting the dict at each step
        urls = open(urls).read().strip().split("\n")[:max_pics]
        dest = Path(dest)
        dest.mkdir(exist_ok=True)
        parallel(partial(self._download_image_inner_2, dest, timeout=timeout), urls, max_workers=max_workers)
        
In [8]:
# example of what you have to do for saving url filenames that are utf8 encoded
from urllib.parse import unquote
unquote('https://upload.wikimedia.org/wikipedia/commons/0/02/%D0%9A%D1%83%D1%80%D1%81%D0%B0%D0%BD%D1%82%D0%B8_%D0%A5%D0%B0%D1%80%D0%BA%D1%96%D0%B2%D1%81%D1%8C%D0%BA%D0%BE%D0%B3%D0%BE_%D1%83%D0%BD%D1%96%D0%B2%D0%B5%D1%80%D1%81%D0%B8%D1%82%D0%B5%D1%82%D1%83_%D0%9F%D0%BE%D0%B2%D1%96%D1%82%D1%80%D1%8F%D0%BD%D0%B8%D1%85_%D0%A1%D0%B8%D0%BB_%D0%BF%D1%80%D0%B8%D1%81%D1%82%D1%83%D0%BF%D0%B8%D0%BB%D0%B8_%D0%B4%D0%BE_%D0%BF%D0%BE%D0%BB%D1%8C%D0%BE%D1%82%D1%96%D0%B2_%D0%BD%D0%B0_%D0%B1%D0%BE%D0%B9%D0%BE%D0%B2%D0%B8%D1%85_%D0%BB%D1%96%D1%82%D0%B0%D0%BA%D0%B0%D1%85_%D0%A1%D1%83-25_%D1%82%D0%B0_%D0%9C%D1%96%D0%B3-29.jpg')
Out[8]:
'https://upload.wikimedia.org/wikipedia/commons/0/02/Курсанти_Харківського_університету_Повітряних_Сил_приступили_до_польотів_на_бойових_літаках_Су-25_та_Міг-29.jpg'
In [ ]:
# download dataset
downloader = ImageDownloader()
for url_path in urls.ls():
    aircraft_type = url_path.name.split('.')[0] # get class name
    downloader.clas = aircraft_type
    print(f'downloading: {aircraft_type}')
    dest = path/aircraft_type; dest.mkdir(parents=True, exist_ok=True) # set & create class folder
    downloader.download_images_2(url_path, dest)

So I learned that when you set the number of processes via max_workers to anything greater than 1, you're not able to write anything to a dictionary -- presumably because the work runs in separate processes, so each worker writes to its own copy of the object and the parent never sees the updates. This may be intended behavior given the stackoverflow thread I mentioned here. If it's -1, 0, or 1, then you're good to go.
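
A minimal sketch of that behavior using only the standard library (as far as I can tell, fastai's parallel hands the work to a process pool when max_workers > 1): the return values come back, the in-place writes don't. Best run as a script:

In [ ]:
# each worker process mutates its own copy of `shared`; the parent never sees the writes
from concurrent.futures import ProcessPoolExecutor

shared = {}

def record(i):
    shared[i] = f'{i:08d}.jpg'   # lands in the worker's copy only
    return i

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as ex:
        results = list(ex.map(record, range(8)))
    print(results)   # [0, 1, 2, 3, 4, 5, 6, 7] -- return values come back fine
    print(shared)    # {} -- the parent's dict is untouched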

Unfortunately you don't get the cool blue progress bar in that case.

Also: this will take all night. Almost two hours in, the downloader has only gotten through 10/27 classes. There's a faster way to do it. If I were running a company, how would I do this? Well, if this were something that had to get done now and wasn't necessarily going to be repeated -- or if getting it done this time was much more important: run multiple processes and just print out the successful downloads, then run regex filters over the printout to pull out the failures and the successful mappings.

The great thing about this method is (I think) that you can run it from a terminal and save the output straight to a text file, then do the filter/cleaning operations off of that. That actually sounds good, and like something I'd do in a company.

2018-12-04 10:41:45

This way actually worked perfectly, giving a printout of 10,311 lines.

In [117]:
len(downloader.url_fname_dict['tornado']) # max_workers -1, 0, or 1
Out[117]:
399
In [108]:
len(downloader.url_fname_dict['tornado']) # max_workers > 1
Out[108]:
0

2018-12-04 00:33:48 ; 2018-12-04 01:53:50

In [10]:
import re
from collections import defaultdict
In [11]:
# failed_links_path = path/'fighterjet-failed-links.txt' # copy-paste above download output to text file first
download_printout_path = path/'download-printout.txt'
In [15]:
fail_pat = re.compile(r'Error \S+')        # failed downloads: "Error <url> <exception>"
clas_pat = re.compile(r'downloading: \S+') # class headers: "downloading: <class>"
save_pat = re.compile(r'data/\S+')         # saved filepath in "saved: <path> - <url>" lines
link_pat = re.compile(r'\s-\s\S+')         # the " - <url>" tail of saved lines

To test that it works, I'll save the output to a dictionary and count the number of links.
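
As a quick sanity check first, here's what each pattern pulls out of a few made-up printout lines (the urls are hypothetical; the line formats match the print statements above):

In [ ]:
sample = ['downloading: f22',
          'saved: data/aircraft/f22/00000012.jpg - https://example.com/raptor.jpg',
          'Error https://example.com/broken-link.jpg HTTPError']
for line in sample:
    print(clas_pat.findall(line), save_pat.findall(line),
          link_pat.findall(line), fail_pat.findall(line))
# ['downloading: f22'] [] [] []
# [] ['data/aircraft/f22/00000012.jpg'] [' - https://example.com/raptor.jpg'] []
# [] [] [] ['Error https://example.com/broken-link.jpg']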

In [34]:
removal_urls = defaultdict(lambda:[])
In [66]:
file_mapping = defaultdict(lambda:{})
In [29]:
# with open(download_printout_path) as f:
#     for i,line in enumerate(f):
# #         aircraft_type = clas_pat.search(line).group(0).split()[-1] if clas_pat.search(line) else aircraft_type
#         aircraft_type = clas_pat.findall(line)
    
#         if clas_pat.findall(line): aircraft_type = clas_pat.findall(line)[0]
#         elif fail_pat.findall(line): fail_url   = fail_pat.findall(line)[0]
#         elif save_pat.findall(line) and link_pat.findall(line):
#                 save_path = save_pat.findall(line)[0]
#                 link      = link_pat.findall(line)[0]
    
#         print(aircraft_type)
#         if i == 10: break
        
downloading: tornado
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
In [72]:
with open(download_printout_path) as f:
    for line in f:
        # update class
        aircraft_type = clas_pat.findall(line)
        clas = aircraft_type[0].split()[-1] if aircraft_type else clas
        # search download path & url
        save,link = save_pat.findall(line), link_pat.findall(line)
        if save and link: 
            link = link[0].split(' - ')[-1]
            file_mapping[clas][save[0]] = link
        # search failed download url
        fail_link = fail_pat.findall(line)
        if fail_link: removal_urls[clas].append(fail_link[0])
In [73]:
file_mapping.keys()
Out[73]:
dict_keys(['tornado', 'f35', 'su57', 'f22', 'f4', 'mig29', 'typhoon', 'jas39', 'su34', 'su25', 'su30', 'su24', 'su27', 'su17', 'f18e', 'f15c', 'f18c', 'f15e', 'mig25', 'mig31', 'f14', 'f16', 'mig27', 'mig23', 'rafale', 'j20', 'mig21'])
In [75]:
len(removal_urls)
Out[75]:
8
In [84]:
print(f'{"class":<8} {"n":<5} {"removes"}\n{"–"*22}')
for k in file_mapping.keys():
    print(f'{k:<8} {len(file_mapping[k]):<5} {len(removal_urls[k])}')
class    n     removes
––––––––––––––––––––––
tornado  399   0
f35      97    0
su57     361   0
f22      388   3
f4       398   1
mig29    394   0
typhoon  395   0
jas39    387   1
su34     393   0
su25     391   0
su30     399   0
su24     388   0
su27     394   0
su17     389   1
f18e     391   0
f15c     396   0
f18c     393   0
f15e     394   0
mig25    390   0
mig31    389   2
f14      394   0
f16      393   0
mig27    387   1
mig23    394   2
rafale   394   0
j20      366   5
mig21    387   0

Now that I have the mapping, I can save the dicts to disk, do my 'visual inspection', and use them to clean the url files.

You can't pickle a defaultdict created with a lambda function (the lambda itself isn't picklable), but I already have what I needed from the 'default' behavior, so I can just convert them to regular dictionaries (see here & discussion here):

In [86]:
torch.save(dict(file_mapping), path/'file_mapping.pkl')
torch.save(dict(removal_urls), path/'removal_urls.pkl')
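
A quick illustration of why the dict() conversion is needed -- torch.save pickles the object under the hood, and it's the lambda default_factory that chokes:

In [ ]:
import pickle
from collections import defaultdict

dd = defaultdict(lambda: [])
try: pickle.dumps(dd)
except Exception as e: print(type(e).__name__, '-', e)  # the lambda can't be pickled
pickle.dumps(dict(dd))  # fine once it's a plain dict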
In [26]:
# with open(download_printout_path) as f:
#     for i,line in enumerate(f):
# #         aircraft_type = clas_pat.search(line).group(0).split()[-1] if clas_pat.search(line) else aircraft_type
#         aircraft_type = clas_pat.findall(line)
    
#         if clas_pat.findall(line): aircraft_type = clas_pat.findall(line)[0]
#         elif: fail_pat.findall(line): fail_url   = fail_pat.findall(line)[0]
#         elif: save_pat.findall(line) and link_pat.findall(line):
#                 save_path = save_pat.findall(line)[0]
#                 link      = link_pat.findall(line)[0]
    
#         print(aircraft_type)
#         if i == 10: break
        
['downloading: tornado']
yi
[]
ni
[]
ni
[]
ni
[]
ni
[]
ni
[]
ni
[]
ni
[]
ni
[]
ni
[]
ni
In [15]:
# with open(download_printout_path) as f:
#     for line in f:
#         # run regex filters
#         aircraft_type = clas_pat.search(line).group(0).split()[-1] if clas_pat.search(line) else aircraft_type
#         fail = fail_pat.search(line)
#         save_path = save_pat.search(line).group(0)
#         link = link_pat.search(line).group(0).split()[-1] if link_pat.search(line) else None
        
        
#         # operations based on filters
#         if aircraft_type not in file_mapping.keys(): file_mapping[aircraft_type] = {}
#         if fail: removal_urls[aircraft_type].append(link.group(0).split()[-1])
In [18]:
# removal_urls[aircraft_type]
In [17]:
removal_urls.keys()
Out[17]:
dict_keys(['mig21', 'f16', 'tornado', 'f15e', 'su30', 'f15c', 'su27', 'su57', 'su17', 'f18c', 'mig29', 'mig31', 'f22', 'f18e', 'typhoon', 'j20', 'mig23', 'jas39', 'f14', 'su34', 'su24', 'f4', 'mig27', 'su25', 'rafale', 'mig25', 'f35'])
In [19]:
count = 0
for k in removal_urls.keys(): count += len(removal_urls[k])
count
Out[19]:
325

After checking and updating the code a bit, the only leftover unmatched lines contain neither links nor classes. Woo.
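
A check along these lines is enough -- count the printout lines that none of the four patterns match; they should all be blank or progress-bar lines rather than anything carrying a class, path, or url:

In [ ]:
unmatched = []
with open(download_printout_path) as f:
    for line in f:
        if not (clas_pat.search(line) or save_pat.search(line)
                or link_pat.search(line) or fail_pat.search(line)):
            unmatched.append(line)
len(unmatched)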

In [ ]:
path.ls()

Remove broken links from URL files:

In [20]:
for aircraft_type in removal_urls.keys():
    fpath = path/'fighterjet-urls'/(aircraft_type + '.txt')
    with open(fpath) as f: text_file = [line for line in f] # open file; read lines
    # filter into a new list rather than popping while iterating, which shifts the
    # indices and can skip lines; strip the trailing \n for the comparison
    text_file = [line for line in text_file if line.rstrip() not in removal_urls[aircraft_type]]
    with open(fpath, mode='wt') as f: # mode 'wt' truncates and overwrites the original file: https://stackoverflow.com/a/11469328
        for line in text_file: f.write(line)

3. Verify downloads

Delete all corrupted downloads:

In [5]:
aircraft_types = [c.name.split('.')[0] for c in urls.ls()]
In [6]:
for c in aircraft_types:
    print(c)
    verify_images(path/c, delete=True, max_size=500)
tornado
100.00% [252/252 00:00<00:00]
f35
100.00% [81/81 00:00<00:00]
su57
100.00% [261/261 00:00<00:00]
f22
100.00% [307/307 00:00<00:00]
f4
100.00% [306/306 00:00<00:00]
Image data/aircraft/f4/00000386.gif has 1 instead of 3
Image data/aircraft/f4/00000225.gif has 1 instead of 3
mig29
100.00% [327/327 00:00<00:00]
typhoon
100.00% [314/314 00:00<00:00]
jas39
100.00% [246/246 00:00<00:00]
su34
100.00% [318/318 00:00<00:00]
su25
100.00% [241/241 00:00<00:00]
su30
100.00% [201/201 00:00<00:00]
su24
100.00% [245/245 00:00<00:00]
su27
100.00% [160/160 00:00<00:00]
su17
100.00% [126/126 00:00<00:00]
f18e
100.00% [246/246 00:00<00:00]
f15c
100.00% [195/195 00:00<00:00]
f18c
100.00% [260/260 00:00<00:00]
Image data/aircraft/f18c/00000193.gif has 1 instead of 3
f15e
100.00% [262/262 00:00<00:00]
mig25
100.00% [142/142 00:00<00:00]
mig31
100.00% [222/222 00:00<00:00]
f14
100.00% [298/298 00:00<00:00]
f16
100.00% [319/319 00:00<00:00]
mig27
100.00% [106/106 00:00<00:00]
mig23
100.00% [115/115 00:00<00:00]
rafale
100.00% [306/306 00:00<00:00]
Image data/aircraft/rafale/00000227.gif has 1 instead of 3
j20
100.00% [197/197 00:00<00:00]
mig21
100.00% [297/297 00:00<00:00]

4. Visual Inspection

Clean out the images that don't belong. This is done manually in the file explorer (faster than displaying in jupyter as I did the first time on this project).

I noticed I didn't do the mapping the best way. I should've used a nested {key: {key:val}} mapping of {class: {int_name: url}}; instead I did a flat {int_name: url}. That means a full scan of every key:value pair in the dictionary for each class, which is not ideal.

Actually, one additional mistake means I have to redo the whole download: I saved only filenames as keys, not filepaths. This means there's no way to tell which class a filename belongs to from the dictionary's side. In fact it's worse: there will be up to n_classes identical copies of each filename... meaning entries just get overwritten and the dictionary is useless.

So this forces a redo -- and a chance to correct the original mistake.

5. Update urls

All this work was done on my Mac for 2 reasons: I'm not burning GCP credits, and I can review images fastest through macOS's GUI. With the dataset now fully cleaned, I need to transfer those changes to the remote machine. I'm not going to copy the images over because that won't scale. The dataset was originally 2.27 GB (150 MB after resizing to max 500x500 with the fastai image verifier), but still.

Instead I'm going to use the filename-url mapping I created earlier to find the images that are no longer in the dataset and remove them from the url files. I already have the code to do the removals; all I need to do is update the set of urls marked for removal.

In [10]:
# load urls to remove
removal_urls = torch.load(path/'removal_urls.pkl')
In [11]:
# update the values - since I forgot to remove the 'Error ' part:
for k in removal_urls.keys():
    removes = removal_urls[k]
    # cut off the 'Error ' part
    for i in range(len(removes)):
        removes[i] = removes[i].split('Error ')[-1]
In [13]:
# load filename-url mappings. {class : {filepath : url}}
file_mapping = torch.load(path/'file_mapping.pkl')
In [19]:
path
Out[19]:
PosixPath('data/aircraft')
In [36]:
flist[:5]
Out[36]:
[PosixPath('data/aircraft/mig21/00000366.JPG'),
 PosixPath('data/aircraft/mig21/00000158.jpg'),
 PosixPath('data/aircraft/mig21/00000170.jpg'),
 PosixPath('data/aircraft/mig21/00000038.jpg'),
 PosixPath('data/aircraft/mig21/00000010.jpg')]
In [38]:
removal_urls['f22']
Out[38]:
['https://www.lockheedmartin.com/content/dam/lockheed-martin/aero/photo/f22/f-22.jpg.pc-adaptive.full.medium.jpeg',
 'https://www.lockheedmartin.com/content/dam/lockheed-martin/aero/photo/f22/F-22%20Speedline%20aircraft_10-31-2016_(Lockheed%20Martin%20photo%20by%20Andrew%20McMurtrie).jpg.pc-adaptive.full.medium.',
 'https://www.lockheedmartin.com/content/dam/lockheed-martin/aero/photo/f22/F-22-Squadron.png']
In [42]:
# run through directory, lookup urls of missing files in file_mapping & add to removal_urls
for clas in aircraft_types:
    flist = (path/clas).ls() # pull all filepaths in class folder
    # I keep getting ideas about better ways to do this, which is great,
    # but for now the focus is just to get it done (i.e. dict lookups vs array searches)
    for fpath in file_mapping[clas].keys():
        if Path(fpath) not in flist: # remember flist consists of PosixPaths, not strings
            removal_urls[clas].append(file_mapping[clas][fpath])
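
Following up that comment about dict lookups vs array searches: turning the folder listing into a set of strings makes each membership test O(1). A sketch of the same lookup, for reference only (the loop above has already done the work, so don't also run this against removal_urls):

In [ ]:
# hypothetical helper: urls in file_mapping[clas] whose downloaded files are no longer on disk
def missing_file_urls(clas:str):
    on_disk = {str(p) for p in (path/clas).ls()}  # path strings still in the class folder
    return [url for fpath,url in file_mapping[clas].items() if fpath not in on_disk]

# e.g. missing_file_urls('f22') should list the same urls the loop above appended for 'f22'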
In [44]:
# remove links from the url files
for aircraft_type in removal_urls.keys():
    fpath = path/'fighterjet-urls'/(aircraft_type + '.txt')
    with open(fpath) as f: text_file = [line for line in f] # open file; read lines
    # filter into a new list rather than popping while iterating, which shifts the
    # indices and can skip lines; strip the trailing \n for the comparison
    text_file = [line for line in text_file if line.rstrip() not in removal_urls[aircraft_type]]
    with open(fpath, mode='wt') as f: # mode 'wt' truncates and overwrites the original file: https://stackoverflow.com/a/11469328
        for line in text_file: f.write(line)
In [47]:
# add contents of removal_urls to the master broken links file
with open(path/'fighterjet-failed-links.txt', mode='a') as f:
    for c in removal_urls.keys():
        f.writelines(f'{c}\n')
        for line in removal_urls[c]:
            f.writelines(f'{line}\n')
In [48]:
# save removal_urls to disk (not sure if I'll keep this or the other file)
torch.save(removal_urls, path/'removal_urls.pkl')

The file mapping is no longer relevant since the images will be redownloaded on the other machine and will get new mappings.

In [52]:
tot = 0
for clas in aircraft_types: tot += len((path/clas).ls())
tot
Out[52]:
6373

The original dataset was 10,241 images; it's been cleaned down to 6,373.
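
Or as a fraction:

In [ ]:
print(f'kept {6373/10241:.1%} of the original downloads')  # kept 62.2% of the original downloads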

In [ ]: