# # For cloud-vm Jupyter lab where I dont have easy control over width yet
# # jupyter full-width cells https://github.com/jupyter/notebook/issues/1909#issuecomment-266116532
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
%reload_ext autoreload
%autoreload 2
from fastai import *
from fastai.vision import *
from fastai.widgets import *; __version__
'1.0.32'
path = Path('data/aircraft')
# ! mv fighterjet-failed-links.txt {path}/
# ! mv fighterjet-urls/ {path}/
# path = Config.data_path()/'aircraft'; path.mkdir(parents=True, exist_ok=True) # set & create data directory
# ! cp -r fighterjet-urls {path}/ # copy urls to data directory
urls = path/'fighterjet-urls'
urls.ls()
[PosixPath('data/aircraft/fighterjet-urls/tornado.txt'), PosixPath('data/aircraft/fighterjet-urls/f35.txt'), PosixPath('data/aircraft/fighterjet-urls/su57.txt'), PosixPath('data/aircraft/fighterjet-urls/f22.txt'), PosixPath('data/aircraft/fighterjet-urls/f4.txt'), PosixPath('data/aircraft/fighterjet-urls/mig29.txt'), PosixPath('data/aircraft/fighterjet-urls/typhoon.txt'), PosixPath('data/aircraft/fighterjet-urls/jas39.txt'), PosixPath('data/aircraft/fighterjet-urls/su34.txt'), PosixPath('data/aircraft/fighterjet-urls/su25.txt'), PosixPath('data/aircraft/fighterjet-urls/su30.txt'), PosixPath('data/aircraft/fighterjet-urls/su24.txt'), PosixPath('data/aircraft/fighterjet-urls/su27.txt'), PosixPath('data/aircraft/fighterjet-urls/su17.txt'), PosixPath('data/aircraft/fighterjet-urls/f18e.txt'), PosixPath('data/aircraft/fighterjet-urls/f15c.txt'), PosixPath('data/aircraft/fighterjet-urls/f18c.txt'), PosixPath('data/aircraft/fighterjet-urls/f15e.txt'), PosixPath('data/aircraft/fighterjet-urls/mig25.txt'), PosixPath('data/aircraft/fighterjet-urls/mig31.txt'), PosixPath('data/aircraft/fighterjet-urls/f14.txt'), PosixPath('data/aircraft/fighterjet-urls/f16.txt'), PosixPath('data/aircraft/fighterjet-urls/mig27.txt'), PosixPath('data/aircraft/fighterjet-urls/mig23.txt'), PosixPath('data/aircraft/fighterjet-urls/rafale.txt'), PosixPath('data/aircraft/fighterjet-urls/j20.txt'), PosixPath('data/aircraft/fighterjet-urls/mig21.txt')]
# # download dataset
# for url_path in urls.ls():
# aircraft_type = url_path.name.split('.')[0] # get class name
# print(f'downloading: {aircraft_type}')
# dest = path/aircraft_type; dest.mkdir(parents=True, exist_ok=True) # set & create class folder
# download_images(url_path, dest)
Download and preserve URL filenames -- this makes it a lot easier to remove links for images you don't want in the dataset.
If you're saving the filename from the url, you also need to convert from utf8-encoded bytes to text. See: https://stackoverflow.com/a/16566128
Unfortunately, I noticed something else. The filenames come from entire urls... it's not at all uncommon for links to have the same filename. In which case the image will just be overwritten. Even if that's not the case; it feels like a better-engineered solution would be to keep a dictionary mapping file integer number to url.
I don't really know how to do that in a callback yet. What I can do instead is have a dictionary as a global variable and write to it.
I also edited download_image
to try to download an image url 5 times before continuing on. This is to catch links that work but not instantly 100% of the time.
Now when this is done, I can copy the actual broken links to the failed links file and clear them from the url lists as before; then go into macOS's Finder and manually remove images that don't fit.
Then I can remove the urls corresponding to filenames that are in the dictionary mapping (ie: they were downloaded) but not in their folders (I removed them).
This doesn't handle misclassed images, but honestly with hundreds per class, it doesn't really matter if I just delete them. The work to move them and then update the move in the url files is a bit too much.
# looks like fastai has a url-to-name function too:
def url2name(url):
    """Return the final path segment of `url` (everything after the last '/')."""
    *_, tail = url.split('/')
    return tail
td = {}
td
{}
c = 'f22'
if c not in td.keys(): td[c] = {}
td
{'f22': {}}
td[c]['name'] = 'url'
td
{'f22': {'name': 'url'}}
the below code doesn't need to look as complicated as it does -- after a lot of iterations I finally found a simple solution that works at full speed: print out the filename and url 😅.
# you could just run `fastai.data.download_image` in a big loop and give it the
# destination filepath yourself; this way adapts fastai's parrallelized method
# to name files by their url filename instead of sequential integers.
# from urllib.parse import unquote # for decoding utf8 bytes
class ImageDownloader(object):
    """Download images in parallel, naming files by sequential index.

    Adapts fastai's parallelized download method: each url is retried up to
    `MAX_TRIES` times (many links succeed only after several attempts), and
    every success/failure is printed so the console log can later be parsed
    with regexes into filename->url mappings and a failed-links list.
    """
    # number of attempts per url before giving up on it
    MAX_TRIES = 5

    def __init__(self):
        # current class label being downloaded; informational only
        self.clas = 'N/A'

    def download_image(self, url, dest, timeout=4):
        """Fetch `url` into file `dest`, retrying on any error.

        On success prints 'saved: <dest> - <url>'; after the final failed
        attempt prints 'Error <url> <exception>'. These exact formats are
        relied on by the log-parsing regexes later in the pipeline.
        """
        # NOTE: saving results to a shared dict from here does not work when
        # fastai's `parallel` runs multiple worker processes.
        for attempt in range(self.MAX_TRIES):
            try:
                download_url(url, dest, overwrite=True, show_progress=False, timeout=timeout)
                print(f'saved: {dest} - {url}')  # a much simpler solution
                break
            except Exception as e:
                # only report the failure once, after the last attempt
                if attempt == self.MAX_TRIES - 1:
                    print(f"Error {url} {e}")

    def _download_image_inner_2(self, dest, url, i, timeout=4):
        """Download the `i`-th url into `dest`, named by index + url extension."""
        # keep the url's file extension if one is present, else default to .jpg
        matches = re.findall(r'\.\w+?(?=(?:\?|$))', url)
        suffix = matches[0] if matches else '.jpg'
        self.download_image(url, dest/f"{i:08d}{suffix}", timeout=timeout)

    def download_images_2(self, urls:Collection[str], dest:PathOrStr, max_pics:int=1000, max_workers:int=8, timeout=4):
        "Download images listed in text file `urls` to path `dest`, at most `max_pics`"
        # FIX: use a context manager -- the original `open(urls).read()` left
        # the file handle unclosed.
        with open(urls) as f:
            url_list = f.read().strip().split("\n")[:max_pics]
        dest = Path(dest)
        dest.mkdir(exist_ok=True)
        # `parallel` invokes the partial as fn(url, index) -- presumably; confirm
        # against the fastai version in use.
        parallel(partial(self._download_image_inner_2, dest, timeout=timeout), url_list, max_workers=max_workers)
# example of what you have to do for saving url filenames that are utf8 encoded
from urllib.parse import unquote
unquote('https://upload.wikimedia.org/wikipedia/commons/0/02/%D0%9A%D1%83%D1%80%D1%81%D0%B0%D0%BD%D1%82%D0%B8_%D0%A5%D0%B0%D1%80%D0%BA%D1%96%D0%B2%D1%81%D1%8C%D0%BA%D0%BE%D0%B3%D0%BE_%D1%83%D0%BD%D1%96%D0%B2%D0%B5%D1%80%D1%81%D0%B8%D1%82%D0%B5%D1%82%D1%83_%D0%9F%D0%BE%D0%B2%D1%96%D1%82%D1%80%D1%8F%D0%BD%D0%B8%D1%85_%D0%A1%D0%B8%D0%BB_%D0%BF%D1%80%D0%B8%D1%81%D1%82%D1%83%D0%BF%D0%B8%D0%BB%D0%B8_%D0%B4%D0%BE_%D0%BF%D0%BE%D0%BB%D1%8C%D0%BE%D1%82%D1%96%D0%B2_%D0%BD%D0%B0_%D0%B1%D0%BE%D0%B9%D0%BE%D0%B2%D0%B8%D1%85_%D0%BB%D1%96%D1%82%D0%B0%D0%BA%D0%B0%D1%85_%D0%A1%D1%83-25_%D1%82%D0%B0_%D0%9C%D1%96%D0%B3-29.jpg')
'https://upload.wikimedia.org/wikipedia/commons/0/02/Курсанти_Харківського_університету_Повітряних_Сил_приступили_до_польотів_на_бойових_літаках_Су-25_та_Міг-29.jpg'
# download dataset: one folder of images per aircraft class
downloader = ImageDownloader()
for url_file in urls.ls():
    aircraft_type = url_file.name.split('.')[0]  # class name comes from '<class>.txt'
    downloader.clas = aircraft_type
    print(f'downloading: {aircraft_type}')
    # create the class folder, then fill it from the url list
    class_dir = path/aircraft_type
    class_dir.mkdir(parents=True, exist_ok=True)
    downloader.download_images_2(url_file, class_dir)
So I learned that when you set the number of processes via max_workers
greater than 1, you're not able to write anything to a dictionary. This may be intended behavior given this stackoverflow thread I mentioned here. If its -1, 0, or 1, then you're good to go.
Unfortunately you don't get the cool blue progress bar in that case.
Also. This will take all night. Almost two hours in, the downloader's only gotten 10/27 classes in. There's a faster way to do it. If I were running a company how would I do this? Well if this was something that had to get done now, and wasn't necessarily going to be repeated -- or if getting it done this time was much more important: run multiple processes and just printout the successful downloads. Then run regex filters over the text to pull out the failures and successful mappings.
The great thing about this method is (I think) you can run it from a terminal and save the output straight to a text file, then do the filter/cleaning operations off of that. That actually sounds good, and something I'd do in a company.
2018-12-04 10:41:45
This way actually worked perfectly, giving a printout of 10,311 lines.
len(downloader.url_fname_dict['tornado']) # max_workers -1, 0, or 1
399
len(downloader.url_fname_dict['tornado']) # max_workers > 1
0
2018-12-04 00:33:48 ; 2018-12-04 01:53:50
import re
from collections import defaultdict
# regexes for parsing the download printout log
# failed_links_path = path/'fighterjet-failed-links.txt' # copy-paste above download output to text file first
download_printout_path = path/'download-printout.txt'
fail_pat = re.compile(r'Error \S+') # split -- matches 'Error <url>' failure lines (keeps the 'Error ' prefix!)
clas_pat = re.compile(r'downloading: \S+') # split -- marks the start of a new class
save_pat = re.compile(r'data/\S+')  # saved-file path printed on a success line
link_pat = re.compile(r'\s-\s\S+') # split -- the ' - <url>' tail of a success line
To test that it works, I'll save the output to a dictionary and count the number of links.
# Default factories: a fresh list / dict per missing key.
# Use the builtin constructors rather than lambdas -- identical behavior,
# but these defaultdicts stay picklable (lambda factories cannot be
# serialized, which is what later forced the dict(...) conversion).
removal_urls = defaultdict(list)
file_mapping = defaultdict(dict)
# with open(download_printout_path) as f:
# for i,line in enumerate(f):
# # aircraft_type = clas_pat.search(line).group(0).split()[-1] if clas_pat.search(line) else aircraft_type
# aircraft_type = clas_pat.findall(line)
# if clas_pat.findall(line): aircraft_type = clas_pat.findall(line)[0]
# elif fail_pat.findall(line): fail_url = fail_pat.findall(line)[0]
# elif save_pat.findall(line) and link_pat.findall(line):
# save_path = save_pat.findall(line)[0]
# link = link_pat.findall(line)[0]
# print(aircraft_type)
# if i == 10: break
downloading: tornado [] [] [] [] [] [] [] [] [] []
# Parse the download log: 'downloading: <class>' lines set the current class,
# 'saved: <path> - <url>' lines record successful mappings, and
# 'Error <url> ...' lines record failed downloads.
# FIX: initialize `clas` -- the original raised NameError if the very first
# line carried a save/fail entry before any class marker appeared.
clas = None
with open(download_printout_path) as f:
    for line in f:
        # update class when a 'downloading: <name>' marker appears
        aircraft_type = clas_pat.findall(line)
        clas = aircraft_type[0].split()[-1] if aircraft_type else clas
        # search download path & url
        save, link = save_pat.findall(line), link_pat.findall(line)
        if save and link:
            link = link[0].split(' - ')[-1]  # drop the ' - ' separator
            file_mapping[clas][save[0]] = link
        # search failed download url (still carries the 'Error ' prefix)
        fail_link = fail_pat.findall(line)
        if fail_link:
            removal_urls[clas].append(fail_link[0])
file_mapping.keys()
dict_keys(['tornado', 'f35', 'su57', 'f22', 'f4', 'mig29', 'typhoon', 'jas39', 'su34', 'su25', 'su30', 'su24', 'su27', 'su17', 'f18e', 'f15c', 'f18c', 'f15e', 'mig25', 'mig31', 'f14', 'f16', 'mig27', 'mig23', 'rafale', 'j20', 'mig21'])
len(removal_urls)
8
print(f'{"class":<8} {"n":<5} {"removes"}\n{"–"*22}')
for k in file_mapping.keys():
print(f'{k:<8} {len(file_mapping[k]):<5} {len(removal_urls[k])}')
class n removes –––––––––––––––––––––– tornado 399 0 f35 97 0 su57 361 0 f22 388 3 f4 398 1 mig29 394 0 typhoon 395 0 jas39 387 1 su34 393 0 su25 391 0 su30 399 0 su24 388 0 su27 394 0 su17 389 1 f18e 391 0 f15c 396 0 f18c 393 0 f15e 394 0 mig25 390 0 mig31 389 2 f14 394 0 f16 393 0 mig27 387 1 mig23 394 2 rafale 394 0 j20 366 5 mig21 387 0
Now that I have the mapping; I can save the dicts to disk, do my 'visual inspection' and use them to clean the url files.
You can't serialize a defaultdict
created with a lambda function, but I already have what I needed from the 'default' side, so I can just convert them to regular dictionaries (see here & discussion here):
# Convert to plain dicts before saving: defaultdicts built from lambda
# factories can't be pickled (torch.save pickles under the hood).
torch.save(dict(file_mapping), path/'file_mapping.pkl')
torch.save(dict(removal_urls), path/'removal_urls.pkl')
# with open(download_printout_path) as f:
# for i,line in enumerate(f):
# # aircraft_type = clas_pat.search(line).group(0).split()[-1] if clas_pat.search(line) else aircraft_type
# aircraft_type = clas_pat.findall(line)
# if clas_pat.findall(line): aircraft_type = clas_pat.findall(line)[0]
# elif: fail_pat.findall(line): fail_url = fail_pat.findall(line)[0]
# elif: save_pat.findall(line) and link_pat.findall(line):
# save_path = save_pat.findall(line)[0]
# link = link_pat.findall(line)[0]
# print(aircraft_type)
# if i == 10: break
['downloading: tornado'] yi [] ni [] ni [] ni [] ni [] ni [] ni [] ni [] ni [] ni [] ni
# with open(download_printout_path) as f:
# for line in f:
# # run regex filters
# aircraft_type = clas_pat.search(line).group(0).split()[-1] if clas_pat.search(line) else aircraft_type
# fail = fail_pat.search(line)
# save_path = save_pat.search(line).group(0)
# link = link_pat.search(line).group(0).split()[-1] if link_pat.search(line) else None
# # operations based on filters
# if aircraft_type not in file_mapping.keys(): file_mapping[aircraft_type] = {}
# if fail: removal_urls[aircraft_type].append(link.group(0).split()[-1])
# removal_urls[aircraft_type]
removal_urls.keys()
dict_keys(['mig21', 'f16', 'tornado', 'f15e', 'su30', 'f15c', 'su27', 'su57', 'su17', 'f18c', 'mig29', 'mig31', 'f22', 'f18e', 'typhoon', 'j20', 'mig23', 'jas39', 'f14', 'su34', 'su24', 'f4', 'mig27', 'su25', 'rafale', 'mig25', 'f35'])
count = 0
for k in removal_urls.keys(): count += len(removal_urls[k])
count
325
After checking and updating the code a bit; the only extra lines do not contain links or classes. Woo.
path.ls()
Remove broken links from URL files:
# Remove broken links from each class's url file.
# BUG FIX: the original called text_file.pop(i) while enumerating text_file;
# mutating a list during iteration skips the element right after each
# removal, so consecutive bad urls survived. Build a filtered list instead.
for aircraft_type in removal_urls.keys():
    fpath = path/'fighterjet-urls'/(aircraft_type + '.txt')
    removes = set(removal_urls[aircraft_type])  # O(1) membership tests
    with open(fpath) as f:
        # rstrip() drops the trailing '\n' for the comparison only
        kept = [line for line in f if line.rstrip() not in removes]
    with open(fpath, mode='wt') as f:  # truncates & rewrites the original file
        f.writelines(kept)
Delete all corrupted downloads:
aircraft_types = [c.name.split('.')[0] for c in urls.ls()]  # class names from '<class>.txt' url files
for c in aircraft_types:
    print(c)
    # fastai: delete unreadable/corrupt images and shrink large ones
    # (per the notes below, max_size=500 resized the set from ~2.27GB to ~150MB)
    verify_images(path/c, delete=True, max_size=500)
tornado
f35
su57
f22
f4
Image data/aircraft/f4/00000386.gif has 1 instead of 3 Image data/aircraft/f4/00000225.gif has 1 instead of 3 mig29
typhoon
jas39
su34
su25
su30
su24
su27
su17
f18e
f15c
f18c
Image data/aircraft/f18c/00000193.gif has 1 instead of 3 f15e
mig25
mig31
f14
f16
mig27
mig23
rafale
Image data/aircraft/rafale/00000227.gif has 1 instead of 3 j20
mig21
Clean out the images that don't belong. This is done manually in the file explorer (faster than displaying in jupyter as I did the first time on this project).
I noticed I didn't do the mapping the best way. I should've done a {key: {key:val}} mapping of {class: {int_name: url}}. Instead I did {int_name: url}. This means I have to do a full lookup of every key:value pair in the dictionary for each class. This is not ideal.
Actually one additional mistake means I have to redo the whole download: I didn't save filepaths, I saved filenames only, as keys. This means there's no way to tell which class a filename belongs to on the dictionary's side. In fact it's worse: because there are going to be at most n_classes
identical copies of each filename... meaning the dictionary is useless because entries are just getting rewritten.
So this forces a chance to correct the original mistake.
All this work was done on my Mac for 2 reasons: I'm not burning GCP credits, and I can review images fastest through macOS's GUI. With the dataset now fully cleaned, I need to transfer those changes to the remote machine. I'm not going to move the images because that won't scale. The dataset was originally 2.27 GB; 150MB after resizing to max(500x500) w/ the fastai image verifier, but still.
Instead I'm going to use the filename-url mapping I worked on creating earlier to find the images that are no longer in the dataset, and remove them from the url files. I already have the code to do the removals. All I need to do is update the file containing urls to remove.
# load urls to remove
removal_urls = torch.load(path/'removal_urls.pkl')
# strip the 'Error ' prefix that the failure regex captured along with each url
removal_urls = {clas: [u.split('Error ')[-1] for u in links]
                for clas, links in removal_urls.items()}
# load filename-url mappings. {class : {filepath : url}}
file_mapping = torch.load(path/'file_mapping.pkl')
path
PosixPath('data/aircraft')
flist[:5]
[PosixPath('data/aircraft/mig21/00000366.JPG'), PosixPath('data/aircraft/mig21/00000158.jpg'), PosixPath('data/aircraft/mig21/00000170.jpg'), PosixPath('data/aircraft/mig21/00000038.jpg'), PosixPath('data/aircraft/mig21/00000010.jpg')]
removal_urls['f22']
['https://www.lockheedmartin.com/content/dam/lockheed-martin/aero/photo/f22/f-22.jpg.pc-adaptive.full.medium.jpeg', 'https://www.lockheedmartin.com/content/dam/lockheed-martin/aero/photo/f22/F-22%20Speedline%20aircraft_10-31-2016_(Lockheed%20Martin%20photo%20by%20Andrew%20McMurtrie).jpg.pc-adaptive.full.medium.', 'https://www.lockheedmartin.com/content/dam/lockheed-martin/aero/photo/f22/F-22-Squadron.png']
# run through directory, lookup urls of missing files in file_mapping & add to removal_urls
for clas in aircraft_types:
    # files still on disk after the manual visual inspection; a set gives
    # O(1) membership tests instead of an O(n) list scan per lookup
    on_disk = set((path/clas).ls())
    # iterate mapping pairs directly rather than key-then-index lookups
    for fpath, url in file_mapping[clas].items():
        if Path(fpath) not in on_disk:  # remember .ls() yields Posix-paths, not strings
            removal_urls[clas].append(url)  # file was deleted -> drop its url
# remove links from the url files
# BUG FIX: the original popped from text_file while enumerating it, which
# skips the element following each removal; build a filtered list instead.
for aircraft_type in removal_urls.keys():
    fpath = path/'fighterjet-urls'/(aircraft_type + '.txt')
    removes = set(removal_urls[aircraft_type])  # O(1) membership tests
    with open(fpath) as f:
        # rstrip() drops the trailing '\n' for the comparison only
        kept = [line for line in f if line.rstrip() not in removes]
    with open(fpath, mode='wt') as f:  # truncates & rewrites the file
        f.writelines(kept)
# append removal_urls to the master broken-links file: each class name on
# its own line, followed by that class's removed urls
with open(path/'fighterjet-failed-links.txt', mode='a') as f:
    for clas_name in removal_urls:
        f.write(f'{clas_name}\n')
        for url in removal_urls[clas_name]:
            f.write(f'{url}\n')
# save removal_urls to disk (not sure if I'll keep this or the other file)
torch.save(removal_urls, path/'removal_urls.pkl')
The filemapping is no longer relevant since the images will be redownloaded on the other machine, and will have new mappings.
# count the images that survived cleaning, across all class folders
tot = sum(len((path/c).ls()) for c in aircraft_types)
tot
6373
The original dataset size was 10,241 images; this has been cleaned down to 6,373.