Ident-O-Matic

This notebook lets you run files through various format identification tools and compare the results. The format tools are: Siegfried, Apache Tika, File, TrID, DROID, Fido, MediaInfo, ffprobe, GitHub Linguist and CLOC.

You can use one of the example files (taken from the Open Preservation Foundation Format Corpus), or (TBA) supply the URL of a public file, or upload your own file.

NOTE that while any files you upload to this cloud-hosted service should remain private, this cannot be guaranteed.

In [1]:
import os
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import fileupload
import subprocess
import tempfile
import glob
In [2]:
%%javascript
// This is necessary to stop the output area folding up:
// replace OutputArea's _should_scroll check with a function that returns
// false regardless of the `lines` argument it is given.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}
In [5]:
# Example files offered in the example dropdown, as (label, path) pairs:
# the label is what the user sees, the path is what gets handed to the tools.
# Every label ends with the bare file name in parentheses.
options = [
    ('Lorem Ipsum plain text file (lorem-ipsum.txt)', 'test-files/lorem-ipsum.txt'),
    ('Lorem Ipsum OpenDocument (lorem-ipsum-libreoffice-4.3.2.2.odt)', 'test-files/lorem-ipsum-libreoffice-4.3.2.2.odt'),
    ('Lorem Ipsum Microsoft Word (lorem-ipsum.doc)', 'test-files/lorem-ipsum.doc'),
    # The label used to embed the 'test-files/' directory, unlike its siblings.
    ('Lorem Ipsum HTML 4 (lorem-ipsum.htm)', 'test-files/lorem-ipsum.htm'),
    ('Lorem Ipsum PDF/A (lorem-ipsum.oo3.2.export-pdfa.pdf)', 'test-files/lorem-ipsum.oo3.2.export-pdfa.pdf'),
    ('A small movie file (png.mov)', 'test-files/png.mov'),
]

# Path of the most recently uploaded file; None until an upload arrives.
tmp_file = None
# Scratch directory that uploaded files are written into.
tmp_dir = tempfile.mkdtemp()

# Output area that every tool report is rendered into.
results = widgets.Output()

def clear_all(b):
    '''
    Empty the results area and put the input widgets back to their defaults.

    The URL box is blanked and the example dropdown returns to the first
    entry of `options`. `b` is the argument supplied by the click handler
    and is not used.
    '''
    results.clear_output()
    input_url.value = ''
    first_example_path = options[0][1]
    select_input.value = first_example_path
    
def run_command(command, title):
    '''
    Run an external tool and append its output to the results area.

    command: the program and its arguments, as a list (no shell is involved).
    title: heading for this tool's section; inserted into the HTML as-is.

    Standard output is shown in a plain <pre> block and standard error in a
    gray one. If the program cannot be started (for example it is not
    installed), the error is reported in the gray block instead of being
    raised, so a caller running several tools in sequence can carry on.
    '''
    # Local import: the notebook's import cell does not bring in `html`.
    import html

    try:
        proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as err:
        stdout = ''
        stderr = 'Could not run %s: %s' % (command[0], err)
    else:
        # Tool output is not guaranteed to be valid UTF-8; substitute
        # undecodable bytes rather than failing the whole report.
        stdout = proc.stdout.decode('utf-8', errors='replace')
        stderr = proc.stderr.decode('utf-8', errors='replace')

    # Escape the tool output so that '<', '>' and '&' are shown literally
    # instead of being interpreted as markup by the browser.
    safe_stdout = html.escape(stdout, quote=False)
    safe_stderr = html.escape(stderr, quote=False)
    with results:
        display(HTML("<h2>%s</h2><pre>%s</pre><pre style='color: gray'>%s</pre>" % (title, safe_stdout, safe_stderr)))

def analyse_input(b):
    '''
    Work out which input file is selected and run every tool on it.

    The active tab decides the input: tab 0 uses the example chosen in the
    dropdown, tab 2 uses the most recently uploaded file, and tab 1 (URL
    fetching) is not supported yet. When no usable input is available a
    short message is shown in the results area and no tool is run.

    `b` is the argument supplied by the click handler and is not used.
    '''
    # Relies on the module-level widgets (tab, select_input, results) and on
    # tmp_file, which the upload callback sets; they are only read here.

    # Start from an empty results area so earlier reports are not mixed in.
    results.clear_output()

    input_file = None
    problem = None
    if tab.selected_index == 0:
        input_file = select_input.value
    elif tab.selected_index == 1:
        problem = "Not yet supported!"
    elif tab.selected_index == 2:
        input_file = tmp_file
        if input_file is None:
            problem = "No file has been uploaded yet."
    else:
        problem = "No input tab is selected."

    if problem is not None:
        # Show the problem where the user is looking: an exception raised
        # from a button callback is not rendered in the results area.
        with results:
            display(HTML("<p><b>%s</b></p>" % problem))
        return

    run_command(["sf", "-sig", "deluxe.sig", input_file], "Siegfried ('deluxe' mode)")

    run_command(["tika.sh", "-d", input_file], "Apache Tika")

    run_command(["file", input_file], "File")

    run_command(["trid", input_file], "TrID")

    # DROID needs explicit signature files; use the first match of each kind.
    bin_sigs = glob.glob("/usr/share/siegfried/DROID_SignatureFile_V*.xml")
    con_sigs = glob.glob("/usr/share/siegfried/container-signature-*.xml")
    if bin_sigs and con_sigs:
        droid_cmd = ["droid.sh",
                     "-q",
                     "-Nr", input_file,
                     "-Ns", bin_sigs[0],
                     "-Nc", con_sigs[0]]
        run_command(droid_cmd, "DROID")
    else:
        # Without signature files DROID cannot run; say so and keep going
        # with the remaining tools instead of failing on an empty glob.
        with results:
            display(HTML("<h2>DROID</h2><pre style='color: gray'>"
                         "No DROID signature files found in /usr/share/siegfried/"
                         "</pre>"))

    run_command(["fido", input_file], "Fido")

    run_command(["mediainfo", input_file], "MediaInfo")

    run_command(["ffprobe", "-hide_banner", input_file], "ffprobe")

    run_command(["github-linguist", input_file], "GitHub Linguist")

    run_command(["cloc", input_file], "CLOC")



def _cb(change):
    '''
    Save a freshly uploaded file into the scratch directory.

    change: the change notification from the upload widget; its 'owner' is
        read for the `filename` and `data` of the upload.

    On success the module-level `tmp_file` is set to the saved path and the
    upload label is updated with the file name and size.
    '''
    # Hacky reliance on global here:
    global tmp_file
    upload = change['owner']
    data = upload.data
    # The name is supplied by the uploader, so keep only its final path
    # component: os.path.join() would otherwise let an absolute path or
    # '../' segments place the file outside tmp_dir.
    filename = os.path.basename(upload.filename)
    if filename in ('', '.', '..'):
        filename = 'upload'
    target = os.path.join(tmp_dir, filename)
    with open(target, "wb") as f:
        f.write(data)
    # Publish the path only once the file has been written, so a failed
    # write does not leave tmp_file pointing at a missing or partial file.
    tmp_file = target
    _upload_label.value = 'Uploaded `{}` ({:.2f} kB)'.format(
        filename, len(data) / 2 ** 10)

# Label under the upload control; the upload callback fills it in.
_upload_label = widgets.Label(value='')

# Upload control: _cb is called whenever its 'data' trait changes.
_upload_widget = fileupload.FileUploadWidget()
_upload_widget.observe(_cb, names='data')

# Contents of the upload tab: the control with its status label beneath it.
upload_tab = widgets.VBox([
    _upload_widget,
    _upload_label,
])

# Dropdown listing the example files defined in `options`.
select_input = widgets.Dropdown(
    layout=widgets.Layout(width='90%'),
    options=options,
    description='',
    disabled=False,
)

# Text box for the URL of a file to fetch.
input_url = widgets.Text(
    layout=widgets.Layout(width='90%'),
    description='URL:',
    placeholder='Enter the URL to fetch',
    disabled=False,
)

# button_style may be 'success', 'info', 'warning', 'danger', 'primary' or ''.

# Button with the default (unstyled) look, labelled 'Clear'.
clear_button = widgets.Button(
    description='Clear',
    tooltip='Clear current data',
    button_style='',
    icon='',
    disabled=False,
)

# Button with the 'primary' look, labelled 'Analyse'.
analyse_button = widgets.Button(
    description='Analyse',
    tooltip='Analyse',
    button_style='primary',
    icon='',
    disabled=False,
)

# First tab: a short prompt above the example dropdown.
select_note = widgets.HTML('Select an example file to analyse:')
select_tab = widgets.VBox([select_note, select_input])

# One tab per way of supplying a file, titled in the same order.
tab = widgets.Tab(children=[select_tab, input_url, upload_tab])
tab.set_title(0, 'Select an example')
tab.set_title(1, 'Fetch a URL')
tab.set_title(2, 'Upload a file')

# Hook the buttons up to their handlers.
analyse_button.on_click(analyse_input)
clear_button.on_click(clear_all)

# Page layout, top to bottom: tabs, button row, results area.
display(widgets.VBox([
    tab,
    widgets.HBox([analyse_button, clear_button]),
    results,
]))

Some ideas for future implementation:

  • [ ] Tabular output
  • [ ] Also collect timings
  • [ ] Download results as CSV
  • [ ] MOAR TOOLS?! (e.g. Fido, MediaInfo, ffprobe, GitHub Linguist)
  • [ ] Option to select Siegfried signature set, and/or DROID signature version(s)?
  • [ ] Option to prevent tools from using the file extension.
  • [ ] Option to allow results to be kept/aggregated? (but no actual file data)

Created by Andrew Jackson. Inspired by Tim Sherratt's GLAM CSV Explorer.

See https://www.digipres.net/guides/format-id/ for more information.

In [ ]: