Title: Storage for MMIR — Author: Thomas Breuel — Institution: UniKL
covering:
- image formats
- important tools
iname = "raw_images/DSCN0243.JPG"
image = imread(iname)
imshow(image)
<matplotlib.image.AxesImage at 0x5e96910>
# getting metadata
import pyexiv2
metadata = pyexiv2.ImageMetadata(iname)
metadata.read()
for key in metadata.exif_keys[::10]:
print key,"\t",metadata[key]
Exif.Image.ImageDescription <Exif.Image.ImageDescription [Ascii] = > Exif.Image.ExifTag <Exif.Image.ExifTag [Long] = 252> Exif.Photo.CompressedBitsPerPixel <Exif.Photo.CompressedBitsPerPixel [Rational] = 2/1> Exif.Nikon3.Version <Exif.Nikon3.Version [Undefined] = 0 2 0 0> Exif.Nikon3.ISOSelection <Exif.Nikon3.ISOSelection [Ascii] = AUTO > Exif.Nikon3.0x0015 <Exif.Nikon3.0x0015 [Ascii] = AUTO > Exif.Nikon3.0x0030 <Exif.Nikon3.0x0030 [Short] = 0> Exif.Nikon3.ShotInfo <Exif.Nikon3.ShotInfo [Undefined] = (Binary value suppressed)> Exif.Nikon3.0x00b5 <Exif.Nikon3.0x00b5 [Short] = 4098> Exif.Photo.SceneType <Exif.Photo.SceneType [Undefined] = 1> Exif.Photo.Sharpness <Exif.Photo.Sharpness [Short] = 0>
# image conversion and size
!convert raw_images/DSCN0243.JPG DSCN0243.ppm
!convert DSCN0243.ppm DSCN0243.png
!ls -lh raw_images/DSCN0243.JPG DSCN0243.*
!rm DSCN0243.ppm DSCN0243.png
-rw-r--r-- 1 tmb tmb 916K Oct 31 04:17 DSCN0243.png -rw-r--r-- 1 tmb tmb 2.3M Oct 31 04:17 DSCN0243.ppm -rw-r--r-- 1 tmb tmb 201K Oct 30 14:33 raw_images/DSCN0243.JPG
# JPEG quality settings
from PIL import Image
im = Image.open(iname)
im.save("temp.jpg",quality=1)
subplot(121); imshow(imread("temp.jpg")[200:400,200:400])
im.save("temp.jpg",quality=80)
subplot(122); imshow(imread("temp.jpg")[200:400,200:400])
<matplotlib.image.AxesImage at 0x69ddf10>
Remember JPEG Compression:
# in-memory compression requires streams
import StringIO
stream = StringIO.StringIO()
im.save(stream,'jpeg')
compressed = stream.getvalue()
print len(compressed)
84770
# useful in-memory compression functions
import PIL,StringIO
def cjpeg(image,quality=98):
    """JPEG-compress a numpy image array entirely in memory.

    image: array accepted by PIL.Image.fromarray (e.g. HxWx3 uint8)
    quality: JPEG quality setting passed to PIL (1=worst, higher=better)
    Returns the compressed JPEG data as a byte string.
    """
    # io.BytesIO is the correct buffer for binary JPEG output and works
    # on both Python 2.7 and 3 (StringIO.StringIO is text-oriented and
    # gone in Python 3).
    import io
    im = PIL.Image.fromarray(image)
    stream = io.BytesIO()
    im.save(stream,'jpeg',quality=quality)
    compressed = stream.getvalue()
    stream.close()  # release the buffer explicitly
    return compressed
def djpeg(compressed):
    """Decompress an in-memory JPEG byte string into a numpy array.

    compressed: JPEG data as a byte string (e.g. output of cjpeg)
    Returns the decoded image as a numpy array.
    """
    # BytesIO gives PIL a binary, seekable file-like object; works on
    # Python 2.7 and 3, unlike the text-oriented StringIO module.
    import io
    stream = io.BytesIO(compressed)
    im = PIL.Image.open(stream)
    return numpy.array(im)
increasing performance:
# testing in-memory compression
compressed = cjpeg(imread(iname))
decompressed = djpeg(compressed)
imshow(decompressed)
print prod(decompressed.shape),len(compressed)
2359296 275947
Both for display and for analysis, it's useful to store icons of the images being indexed.
from scipy.ndimage import interpolation

def make_icon(image,size=(256,256)):
    """Return a fixed-size uint8 icon (size[0] x size[1] x 3) of `image`.

    The image is scaled to fit inside `size` preserving aspect ratio and
    centered on a black background.

    image: HxW (grayscale), HxWx3 (RGB) or HxWx4 (RGBA; alpha dropped)
           array, either uint8 in [0,255] or float in [0,1]
    size: (height, width) of the icon
    """
    image = numpy.asarray(image)
    # generalize: replicate grayscale to 3 channels, drop any alpha channel
    # (the original crashed on both 2-D and 4-channel input)
    if image.ndim==2:
        image = numpy.dstack([image,image,image])
    image = image[:,:,:3]
    if numpy.amax(image)>1.0: image = image/255.0
    assert numpy.amin(image)>=0.0 and numpy.amax(image)<=1.0
    h,w = size
    ih,iw,d = image.shape
    # uniform scale so the scaled image fits inside (h,w)
    scale = min(h*1.0/ih,w*1.0/iw)
    scaled = interpolation.zoom(image,(scale,scale,1))
    # spline interpolation can overshoot [0,1]; without clipping the final
    # 'B' cast would wrap modulo 256 and corrupt pixels
    scaled = numpy.clip(scaled,0.0,1.0)
    sh,sw,_ = scaled.shape
    # zoom rounds its output size and may exceed the target by one pixel
    sh,sw = min(sh,h),min(sw,w)
    result = numpy.zeros((h,w,3))
    # paste centered
    dh,dw = h//2-sh//2,w//2-sw//2
    result[dh:dh+sh,dw:dw+sw,:] = scaled[:sh,:sw,:]
    return numpy.array(result*255,'B')
image = imread(iname)
icon = make_icon(image,(64,64))
print icon.shape,icon.dtype
imshow(icon,interpolation='none')
(64, 64, 3) uint8
<matplotlib.image.AxesImage at 0x76c5750>
import md5
def md5file(fname):
    """Return the hex MD5 digest of a file, reading it in 4KB chunks.

    fname: path of the file to checksum
    Returns the 32-character lowercase hex digest string.
    """
    # hashlib replaces the deprecated md5 module (available since
    # Python 2.5, and the only option on Python 3)
    import hashlib
    m = hashlib.md5()
    with open(fname,"rb") as stream:
        while 1:
            s = stream.read(4096)
            # empty read means EOF; `not s` works for both "" (Py2)
            # and b"" (Py3), unlike the original s==""
            if not s: break
            m.update(s)
    return m.hexdigest()
md5file(iname)
'5640ebaac01fc22241874189d85dff9a'
# opening the database, creating the table
import sqlite3
!rm -f sample.db
dbname = "sample.db"
db = sqlite3.connect(dbname)
c = db.cursor()
c.execute("create table images (chk text primary key,path text,w integer,h integer,jpeg blob,icon blob,time date)")
<sqlite3.Cursor at 0x5aecf10>
# what we are storing
chk = md5file(iname) # MD5 checksum of the file
image = imread(iname) # JPEG-compressed image
jpeg = cjpeg(image)
h,w,d = image.shape # dimensions of the image
icon = make_icon(image) # an icon (uncompressed)
when = datetime.datetime.now()
# inserting the data into the database
# note the use of `buffer` for blobs
values = (chk,iname,w,h,buffer(jpeg),buffer(icon),when)
c.execute("insert into images values (?,?,?,?,?,?,?)",values)
db.commit()
# retrieving a jpeg image
result = list(c.execute("select jpeg from images limit 1"))
imshow(djpeg(result[0][0]))
<matplotlib.image.AxesImage at 0x76f6a10>
# retrieving an icon
result = list(c.execute("select icon from images limit 1"))
imshow(array(result[0][0],'B').reshape(256,256,3))
<matplotlib.image.AxesImage at 0x6d30810>
# getting metadata
import pyexiv2
from datetime import datetime
def image2db(iname,db,c):
    """Store the image file `iname` in the sqlite `images` table.

    Dedupes on the MD5 checksum of the file contents: if the checksum is
    already present, only the recorded path is refreshed and False is
    returned; otherwise a full row (checksum, path, dimensions, jpeg,
    icon, timestamp) is inserted and True is returned.

    iname: path of the image file
    db: open sqlite3 connection (committed on every exit path)
    c: cursor on `db`
    """
    chk = md5file(iname)  # content checksum; primary key of the table
    # duplicate content: just refresh the recorded path
    if list(c.execute("select chk from images where chk=?",(chk,)))!=[]:
        c.execute("update images set path=? where chk=?",(iname,chk))
        db.commit()
        return False
    # prefer the EXIF capture time; fall back to the insertion time
    metadata = pyexiv2.ImageMetadata(iname)
    metadata.read()
    if "Exif.Photo.DateTimeOriginal" in metadata:
        when = metadata["Exif.Photo.DateTimeOriginal"].value
    else:
        when = datetime.now()
    # content data: recompressed jpeg, dimensions, fixed-size icon
    image = imread(iname)
    jpeg = cjpeg(image)
    # shape[:2] also works for grayscale input, where the original
    # 3-way unpack of image.shape raised ValueError
    h,w = image.shape[:2]
    icon = make_icon(image)
    # `buffer` marks the byte strings as sqlite blobs (Python 2)
    values = (chk,iname,w,h,buffer(jpeg),buffer(icon),when)
    c.execute("insert or replace into images values (?,?,?,?,?,?,?)",values)
    db.commit()
    return True
# clear out the database
c.execute("delete from images")
db.commit()
import glob
for i,iname in enumerate(sorted(glob.glob("raw_images/*.JPG"))):
if i%10==0: print i,iname
if not image2db(iname,db,c):
print iname,"already in the database"
0 raw_images/DSCN0149.JPG 10 raw_images/DSCN0204.JPG 20 raw_images/DSCN0254.JPG 30 raw_images/DSCN0286.JPG 40 raw_images/DSCN0320.JPG 50 raw_images/DSCN0351.JPG 60 raw_images/DSCN0424.JPG 70 raw_images/DSCN0555.JPG 80 raw_images/DSCN0596.JPG 90 raw_images/DSCN0639.JPG 100 raw_images/DSCN0659.JPG 110 raw_images/DSCN0720.JPG 120 raw_images/DSCN0770.JPG 130 raw_images/P1010139.JPG 140 raw_images/P1010209.JPG 150 raw_images/P1010291.JPG 160 raw_images/P1010367.JPG 170 raw_images/P1010409.JPG 180 raw_images/P1010458.JPG 190 raw_images/P1010496.JPG 200 raw_images/P1010536.JPG
# make sure the checksum-based duplicate detection is actually working
for i,iname in enumerate(sorted(glob.glob("raw_images/*.JPG")[:5])):
if i%10==0: print i,iname
if not image2db(iname,db,c):
print iname,"already in the database"
0 raw_images/DSCN0225.JPG raw_images/DSCN0225.JPG already in the database raw_images/DSCN0557.JPG already in the database raw_images/DSCN0642.JPG already in the database raw_images/DSCN0649.JPG already in the database raw_images/P1010456.JPG already in the database
# an SQL query to determine all the different size images
print list(c.execute("select count(*) from images"))
print list(c.execute("select distinct w,h from images"))
[(207,)] [(1024, 179), (1024, 768), (768, 1024), (1024, 328), (1024, 421), (1024, 164), (1024, 90), (328, 1024), (1024, 683), (683, 1024), (1024, 576), (576, 1024)]
# cleaning up and size comparisons
db.close()
!du -h raw_images
!ls -lh sample.db
49M raw_images -rw-r--r-- 1 tmb tmb 106M Oct 31 04:19 sample.db
HDF5:
HDF5 is a database for large scientific datasets in array format.
import tables
from tables import openFile,Filters,Int32Atom,Float32Atom,Int64Atom,UInt8Atom,VLStringAtom
Persistent Arrays:
HDF5 effectively implements persistent arrays or memory mapped arrays (the actual implementation may or may not use memory mapping, but it looks that way from Python):
hdf = tables.openFile("temp.h5","w")
hdf.createEArray("/","icons",UInt8Atom(),shape=[0,256,256,3])
hdf.root.icons.append([icon])
hdf.close()
hdf = tables.openFile("temp.h5","r+")
print len(hdf.root.icons)
imshow(hdf.root.icons[0])
hdf.close()
1
VLArrays, Strings:
You can have variable length arrays of variable length strings. Strings are special and a bit different from numerical arrays. You can also use VLArrays with 1D numerical arrays, giving a ragged array. If you want to store variable sized 2D images, you need to store the dimensions in a separate column and reshape on access.
hdf = tables.openFile("sample6.h5","w")
hdf.createVLArray("/","chk",VLStringAtom())
hdf.root.chk.append(u"hello")
hdf.root.chk.append("world")
hdf.root.chk[0:2]
hdf.close()
# store all the icons from the image database in HDF5
hdf = tables.openFile("sample.h5","w")
hdf.createVLArray("/","chk",VLStringAtom())
hdf.createEArray("/","icons",UInt8Atom(),shape=[0,256,256,3],filters=tables.Filters(9))
db = sqlite3.connect(dbname)
c = db.cursor()
rows = c.execute("select chk,icon from images")
for row in rows:
chk = str(row[0])
icon = array(row[1],'B').reshape(256,256,3)
hdf.root.chk.append(chk)
hdf.root.icons.append([icon])
db.close()
hdf.close()
!ls -lh sample.h5
!h5ls sample.h5
-rw-r--r-- 1 tmb tmb 20M Oct 31 04:20 sample.h5 chk Dataset {207/Inf} icons Dataset {207/Inf, 256, 256, 3}
# quickly accessing HDF5 content
with tables.openFile("sample.h5","r") as hdf:
for i in range(9):
subplot(3,3,i+1)
imshow(hdf.root.icons[i])
Storage: