This notebook lets you run files through various format identification tools and compare the results. The format tools are:
You can use one of the example files (taken from the Open Preservation Foundation Format Corpus), or (TBA) supply the URL of a public file, or upload your own file.
NOTE that while any files you upload to this cloud-hosted service should remain private, this cannot be guaranteed.
import os
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import fileupload
import subprocess
import tempfile
import glob
%%javascript
// This is necessary to stop the output area folding up.
// Replaces the output area's `_should_scroll` check with a function that
// always returns false, regardless of how many lines are passed in.
// NOTE(review): relies on the `IPython` global being available in the
// front end -- confirm this still holds for the notebook UI in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}
# Example files offered in the drop-down, as (label, path) pairs.
options = [
    (
        'Lorem Ipsum plain text file (lorem-ipsum.txt)',
        'test-files/lorem-ipsum.txt',
    ),
    (
        'Lorem Ipsum OpenDocument (lorem-ipsum-libreoffice-4.3.2.2.odt)',
        'test-files/lorem-ipsum-libreoffice-4.3.2.2.odt',
    ),
    (
        'Lorem Ipsum Microsoft Word (lorem-ipsum.doc)',
        'test-files/lorem-ipsum.doc',
    ),
    (
        'Lorem Ipsum HTML 4 (test-files/lorem-ipsum.htm)',
        'test-files/lorem-ipsum.htm',
    ),
    (
        'Lorem Ipsum PDF/A (lorem-ipsum.oo3.2.export-pdfa.pdf)',
        'test-files/lorem-ipsum.oo3.2.export-pdfa.pdf',
    ),
    (
        'A small movie file (png.mov)',
        'test-files/png.mov',
    ),
]
# Path of the most recently uploaded file; stays None until an upload arrives.
tmp_file = None
# Scratch directory that uploaded files are written into.
tmp_dir = tempfile.mkdtemp()
# Output widget that collects the report produced by each tool.
results = widgets.Output()
def clear_all(b):
    """Reset the form to its initial state.

    Re-selects the first example file, empties the URL box and wipes the
    results panel. The argument passed by the click handler is not used.
    """
    _label, default_path = options[0]
    select_input.value = default_path
    input_url.value = ''
    results.clear_output()
def run_command(command, title):
    """Run an external tool and append its output to the results panel.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell).
        title: Heading displayed above the tool's output.

    The tool's stdout is shown in a ``<pre>`` block, followed by its stderr
    in gray. If the tool cannot be started at all, the error is shown in the
    stderr position instead of raising, so that the remaining tools still run.
    """
    # Imported here so the function does not depend on the notebook's
    # import cell being edited.
    from html import escape

    try:
        proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as err:
        # Typically the executable is missing or not executable.
        stdout = ''
        stderr = 'Could not run %r: %s' % (command, err)
    else:
        # Tool output is not guaranteed to be valid UTF-8 (it may echo bytes
        # from the analysed file), so never let decoding raise.
        stdout = proc.stdout.decode('utf-8', errors='replace')
        stderr = proc.stderr.decode('utf-8', errors='replace')

    # Output may contain markup characters (XML reports, file names, file
    # content), so escape everything before embedding it in HTML.
    markup = "<h2>%s</h2><pre>%s</pre><pre style='color: gray'>%s</pre>" % (
        escape(title), escape(stdout), escape(stderr))
    with results:
        display(HTML(markup))
def analyse_input(b):
    '''
    Try to open the input file, and start the analysis.

    Picks the input according to the selected tab (example file, URL, or
    uploaded file), clears the results panel, and then runs each format
    identification tool in turn via run_command().

    If there is no file to analyse (for example the upload tab is selected
    but nothing has been uploaded yet) a message is shown and no tool is run.
    Fetching a URL is not implemented and raises NotImplementedError (a
    subclass of the Exception raised previously).
    '''
    # Hacky reliance on module-level names here (tab, tmp_file and the
    # widgets). They are only read, so no `global` declaration is needed.

    # Start from an empty results panel:
    results.clear_output()

    selected = tab.selected_index
    if selected == 0:
        input_file = select_input.value
    elif selected == 1:
        raise NotImplementedError("Not yet supported!")
    elif selected == 2:
        input_file = tmp_file
    else:
        # No tab selected (or an unexpected index): nothing to analyse.
        input_file = None

    if not input_file:
        with results:
            display(HTML("<p>No input file to analyse: select an example or upload a file first.</p>"))
        return

    run_command(["sf", "-sig", "deluxe.sig", input_file], "Siegfried ('deluxe' mode)")
    run_command(["tika.sh", "-d", input_file], "Apache Tika")
    run_command(["file", input_file], "File")
    run_command(["trid", input_file], "TrID")

    # DROID needs explicit binary and container signature files. Skip it
    # (rather than abort the remaining tools) when they cannot be found.
    bin_sigs = glob.glob("/usr/share/siegfried/DROID_SignatureFile_V*.xml")
    con_sigs = glob.glob("/usr/share/siegfried/container-signature-*.xml")
    if bin_sigs and con_sigs:
        droid_cmd = [
            "droid.sh",
            "-q",
            "-Nr", input_file,
            "-Ns", bin_sigs[0],
            "-Nc", con_sigs[0],
        ]
        run_command(droid_cmd, "DROID")
    else:
        with results:
            display(HTML(
                "<h2>DROID</h2><pre style='color: gray'>"
                "Skipped: signature files not found in /usr/share/siegfried."
                "</pre>"))

    run_command(["fido", input_file], "Fido")
    run_command(["mediainfo", input_file], "MediaInfo")
    run_command(["ffprobe", "-hide_banner", input_file], "ffprobe")
    run_command(["github-linguist", input_file], "GitHub Linguist")
    run_command(["cloc", input_file], "CLOC")
def _cb(change):
    """Store a newly uploaded file in the temp directory and report its size.

    Registered as the observer for the upload widget's ``data`` trait.
    ``change['owner']`` is the widget, from which the file name and bytes
    are read. On success the module-level ``tmp_file`` is set to the stored
    path and the upload label is updated.
    """
    # Hacky reliance on global here:
    global tmp_file
    upload = change['owner']
    data = upload.data

    # The file name comes from the client, so keep only its final path
    # component: an absolute path or '../' must not escape tmp_dir.
    filename = os.path.basename(upload.filename or '')
    if filename in ('', '.', '..'):
        filename = 'upload'

    stored_path = os.path.join(tmp_dir, filename)
    with open(stored_path, "wb") as f:
        f.write(data)
    # Only publish the path once the file has actually been written.
    tmp_file = stored_path

    _upload_label.value = 'Uploaded `{}` ({:.2f} kB)'.format(
        filename, len(data) / 2 ** 10)
# Upload tab: a status label (initially empty) underneath the upload control.
# Changes to the control's `data` trait are handled by `_cb`.
_upload_label = widgets.Label(value="")
_upload_widget = fileupload.FileUploadWidget()
_upload_widget.observe(_cb, names='data')
upload_tab = widgets.VBox([_upload_widget, _upload_label])
# Drop-down listing the bundled example files.
select_input = widgets.Dropdown(
    layout=widgets.Layout(width='90%'),
    options=options,
    description='',
    disabled=False,
)

# Free-text box for the URL of a file to fetch.
input_url = widgets.Text(
    layout=widgets.Layout(width='90%'),
    description='URL:',
    placeholder='Enter the URL to fetch',
    disabled=False,
)
# Action buttons. `button_style` accepts 'success', 'info', 'warning',
# 'danger' or '' (the original comment did not list 'primary', which is
# nevertheless used for the Analyse button below).
analyse_button = widgets.Button(
    description='Analyse',
    tooltip='Analyse',
    button_style='primary',
    icon='',
    disabled=False,
)
clear_button = widgets.Button(
    description='Clear',
    tooltip='Clear current data',
    button_style='',
    icon='',
    disabled=False,
)
# Assemble the three input tabs: example picker, URL box, file upload.
select_note = widgets.HTML('Select an example file to analyse:')
select_tab = widgets.VBox([select_note, select_input])
tab = widgets.Tab(children=[select_tab, input_url, upload_tab])
tab.set_title(0, 'Select an example')
tab.set_title(1, 'Fetch a URL')
tab.set_title(2, 'Upload a file')

# Hook the buttons up to their handlers.
analyse_button.on_click(analyse_input)
clear_button.on_click(clear_all)

# Show the whole UI: tabs, then the button row, then the results panel.
display(
    widgets.VBox([
        tab,
        widgets.HBox([analyse_button, clear_button]),
        results,
    ])
)
VBox(children=(Tab(children=(VBox(children=(HTML(value='Select an example file to analyse:'), Dropdown(layout=…
Some ideas for future implementation:
Created by Andrew Jackson. Inspired by Tim Sherratt's GLAM CSV Explorer.
See https://www.digipres.net/guides/format-id/ for more information.