In [1]:
import csv
import os
import time
from datetime import datetime
from sys import stdout
from time import mktime

from reportlab.lib import utils
In [2]:
# Root directory handed to os.walk; every file beneath it is visited.
search_dir = "test_img_dir"
# Extensions (leading dot included) that check_ext accepts; the match is case-sensitive.
ext_list = [".jpeg", ".jpg", ".png"]
# Cut-off date in DD/MM/YYYY form (mod_date parses it with '%d/%m/%Y').
date_since = "01/01/2000" # must be later than 01/01/1970
In [3]:
def image_size(path):
    """Return the pixel dimensions of the image at ``path`` as ``(width, height)``.

    The file is opened through reportlab's ``ImageReader``; whatever that
    raises for an unreadable file propagates to the caller.
    """
    from reportlab.lib import utils as rl_utils

    reader = rl_utils.ImageReader(path)
    width, height = reader.getSize()
    return (width, height)

def file_size(path):
    """Return the size of ``path`` in bytes, or 0 when it cannot be determined.

    This is a best-effort helper: a missing file, a permission problem or an
    unusable path all yield 0 instead of raising.
    """
    import os
    try:
        return os.path.getsize(path)
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop a long directory walk.
        return 0

def mod_date(date_since, file_path):
    """Report whether a file was modified on or after a cut-off date.

    Parameters
    ----------
    date_since : str
        Cut-off date in ``DD/MM/YYYY`` form, converted with ``time.mktime``
        (i.e. interpreted as local time).
    file_path : str
        File whose modification time is read.

    Returns
    -------
    tuple
        ``(is_recent, mtime)``. ``mtime`` is the modification time in seconds
        since the epoch, or 0 when it cannot be read. ``is_recent`` is True
        when ``mtime`` is at or after the cut-off, and also True (no filtering)
        when either the modification time or the cut-off is unavailable.
    """
    import os
    from datetime import datetime
    from time import mktime
    try:
        file_mod_time = os.path.getmtime(file_path)
    except Exception:
        # Unreadable file: keep the historical "include it, time unknown" answer.
        return True, 0
    try:
        time_since = mktime(datetime.strptime(date_since, '%d/%m/%Y').timetuple())
    except Exception:
        # A malformed or out-of-range cut-off must not erase a perfectly good
        # modification time; skip the filter but still report the real mtime.
        return True, file_mod_time
    return file_mod_time >= time_since, file_mod_time

def check_ext(f):
    """Return True when the extension of path ``f`` appears in the global ``ext_list``.

    The extension is whatever ``os.path.splitext`` yields (leading dot
    included) and is compared as-is, so the match is case-sensitive.
    """
    import os
    extension = os.path.splitext(f)[1]
    return extension in ext_list
    
def time_stamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    from datetime import datetime as _clock
    now = _clock.now()
    return now.strftime("%Y%m%d_%H%M%S")

def get_owner(f):
    """Return the account name that owns file ``f`` (Windows only).

    The lookup goes through the pywin32 modules. When they are not
    installed, or the lookup fails for any ordinary reason, the placeholder
    ``"-"`` is returned instead of raising.
    """
    try:
        import win32api, win32con, win32security
        sd = win32security.GetFileSecurity(f, win32security.OWNER_SECURITY_INFORMATION)
        owner_sid = sd.GetSecurityDescriptorOwner()
        name, _, _ = win32security.LookupAccountSid(None, owner_sid)
        return name
    except Exception:
        # Covers ImportError on non-Windows hosts as well as lookup failures,
        # while still letting KeyboardInterrupt / SystemExit through.
        return "-"
In [4]:
# Wall-clock start, used for the "Job ran for ..." summary at the end of the walk.
start_time = time.time()
# Make sure the output folder exists; open() would otherwise fail on a fresh checkout.
os.makedirs("out", exist_ok=True)
# One CSV per run, named after the moment the run started.
results = "out/search_results" + "_" + time_stamp() + ".csv"
# Written as UTF-8 so file names with non-ASCII characters can be stored and
# so pandas.read_csv (which defaults to UTF-8) can read the file back.
results_file = open(results, "w", encoding="utf-8")
In [5]:
print("Walking through Files...")
i = 0          # every file encountered, regardless of type
file_num = 0   # files that pass both the date and the extension filter
# csv.writer quotes fields containing commas or quotes, which hand-built rows did not.
# lineterminator="\n" keeps line endings identical to the original output
# (the file is in text mode, so the platform translates "\n" itself).
writer = csv.writer(results_file, lineterminator="\n")
try:
    writer.writerow(["Filename", "File Path", "Last Mode Date",
                     "File Size (Bytes)", "Width (px)", "Height (px)"])
    for root, dirs, files in os.walk(search_dir):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for filename in files:
            i += 1
            fullpath = os.path.join(root, filename)
            print(".", end='')
            if i % 100 == 0:
                # Progress label: the 9th path component when the path is that
                # deep, otherwise the whole path (indexing [8] blindly raised
                # IndexError for shallow directory trees).
                parts = fullpath.split("/")
                label = parts[8] if len(parts) > 8 else fullpath
                print("\n\t", str(i), ":", label)

            # Query the modification time once and reuse it below.
            is_recent, mtime = mod_date(date_since, fullpath)
            # NOTE(review): the filter only feeds file_num; a row is still written
            # for every file, as before. Confirm whether rows should be filtered too.
            if is_recent and check_ext(fullpath):
                file_num += 1

            mod_day = datetime.fromtimestamp(int(mtime)).strftime("%Y-%m-%d")
            size_bytes = file_size(fullpath)
            try:
                width, height = image_size(fullpath)
            except Exception:
                # Not readable as an image: keep a shorter row with a marker in
                # place of the path, exactly as the original output did.
                writer.writerow([filename, "_problem_reading_file_as_image_", mod_day, size_bytes])
            else:
                writer.writerow([filename, fullpath, mod_day, size_bytes, width, height])
finally:
    # Close the file even if the walk is interrupted, so written rows reach disk.
    results_file.close()

print("\n\n", str(i), "Files (of all types) encountered")
print("Job ran for", str(round(time.time() - start_time, 1)), "seconds")
Walking through Files...
........................................

 40 Files (of all types) encountered
Job ran for 0.6 seconds

Analysis of the most "popular" sizes

In [6]:
# Dependencies for the analysis half of the notebook: read_csv loads the results
# table and vincent draws the bar charts. DataFrame and prettyplotlib (plt) are
# not referenced in the cells visible here.
from pandas import DataFrame, read_csv
import prettyplotlib as plt
import vincent
# Presumably prepares the notebook front end for vincent charts; it is called
# before any chart is displayed. TODO confirm against the vincent docs.
vincent.initialize_notebook()
In [7]:
# Load the CSV written by this run (path held in `results`) rather than a
# hard-coded file name from an earlier run, so Restart & Run All keeps working.
df = read_csv(results)
In [8]:
# Preview the first rows to check that the CSV loaded with the expected columns.
df.head()
Out[8]:
Filename File Path Last Mode Date File Size (Bytes) Width (px) Height (px)
0 100151.jpg test_img_dir/100151.jpg 1970-01-01 3336 140 140
1 10021.jpg test_img_dir/10021.jpg 1970-01-01 4005 160 160
2 100223.jpg test_img_dir/100223.jpg 1970-01-01 3303 129 129
3 100235.jpg test_img_dir/100235.jpg 1970-01-01 3268 137 137
4 100244.jpg test_img_dir/100244.jpg 1970-01-01 1518 83 83
In [9]:
# The 20 most frequent image widths (value_counts orders by descending count).
w = df["Width (px)"].value_counts().head(20)
In [10]:
# The 20 most frequent image heights (value_counts orders by descending count).
h = df["Height (px)"].value_counts().head(20)

Top 20 Width Sizes in Pixels

In [11]:
# Bar chart of the top-20 widths; sort_index() reorders the counts by pixel
# value so the x-axis runs in ascending width instead of by frequency.
bar = vincent.Bar(w.sort_index())
bar.axis_titles(x='Width, Pixels', y='Count')
bar.display()

Top 20 Height Sizes in Pixels

In [12]:
# Bar chart of the top-20 heights; sort_index() reorders the counts by pixel
# value so the x-axis runs in ascending height instead of by frequency.
bar = vincent.Bar(h.sort_index())
# Axis title spelling corrected ("Heigth" -> "Height").
bar.axis_titles(x='Height, Pixels', y='Count')
bar.display()