#!/usr/bin/env python
# coding: utf-8

# # OpenRefine Client
#
# Notebook demonstrating how to control headless OpenRefine via a Python client.
#
# Use the `dbutlerdb/refine-client-py` fork of `PaulMakepeace/refine-client-py`
# for Python3 support.
#
# I'm not yet convinced this is sensible or that the python client is as usable
# as it might be? Things like `pandas` are perfectly serviceable for working
# with tabular data in a notebook, so why would we want to use the OpenRefine
# engine?

# ## Getting Started
#
# To start, ensure that the OpenRefine application server is running. You can
# start it from the notebook homepage (`New -> OpenRefine Session`).
#
# The server connection is looked for on the default port 3333. This can be
# hardcoded as part of the `nbopenrefineder` OpenRefine start-up command.

# In[1]:


from open.refine import refine

#What is the distinction between these two?
#Can we make use of it in a Jupyter context somehow?
server = refine.RefineServer()
orefine = refine.Refine(server)


# We can list any projects that currently exist (this should be empty):

# In[2]:


orefine.list_projects().items()


# ## Creating an OpenRefine Project
#
#
# *It would be useful to have a simple recipe for creating a project from a
# pandas dataframe.*

# In[5]:


#Create a simple test data file as a CSV file

#via https://github.com/dbutlerdb/refine-client-py/blob/master/tests/data/duplicates.csv
data = '''email,name,state,gender,purchase
danny.baron@example1.com,Danny Baron,CA,M,TV
melanie.white@example2.edu,Melanie White,NC,F,iPhone
danny.baron@example1.com,D. Baron,CA,M,Winter jacket
ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight
arthur.duff@example4.com,Arthur Duff,OR,M,Dining table
danny.baron@example1.com,Daniel Baron,CA,M,Bike
jean.griffith@example5.org,Jean Griffith,WA,F,Power drill
melanie.white@example2.edu,Melanie White,NC,F,iPad
ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier
arthur.duff@example4.com,Arthur Duff,OR,M,Night table'''

fn = 'test.csv'

# Explicit encoding so the fixture is written identically on every platform.
with open(fn, 'w', encoding='utf-8') as f:
    f.write(data)


# In[6]:


import os

#Create an OpenRefine project from the data file
#Use the absolute path to the file
p2 = orefine.new_project(project_file=os.path.abspath(fn),
                         project_name='Test 1',
                         project_file_name=fn)

#Do we have to mediate this via a file transfer?
#eg could we go more directly from a pandas dataframe somehow?


# In[11]:


#For some reason, the project does not appear to get named?
#There also can be a delay before the index listing shows that the data has been loaded?
orefine.list_projects()


# In[14]:


import pandas as pd


# In[15]:


def show_table(p):
    """Display the currently selected rows of an OpenRefine project.

    Fetches the rows via ``p.get_rows()``, builds a pandas DataFrame from the
    cell values, labels the columns from ``p.column_order`` and renders the
    frame with ``display`` (the IPython builtin available in a notebook).

    Args:
        p: project object returned by ``orefine.new_project`` /
            ``orefine.open_project``; must provide ``get_rows()`` and a
            ``column_order`` mapping of column name -> position.

    Returns:
        None. The table is shown as a side effect.
    """
    # A blank cell can arrive as None rather than a {'v': ...} dict, so guard
    # the lookup instead of letting it raise TypeError.
    cells = [[cell['v'] if cell else None for cell in row['cells']]
             for row in p.get_rows().rows.rows_response]
    df = pd.DataFrame(cells)

    #The list of columns seems to include historical items
    #But how do we also guarantee the current one? dicts are inherently unordered?
    # NOTE(review): the client may expose the current names directly
    # (e.g. ``p.columns``) - confirm against refine-client-py before relying
    # on it.
    cols = ['Unnamed_{}'.format(i) for i in range(len(df.columns))]
    width = len(cols)
    for (name, position) in sorted(p.column_order.items(), key=lambda kv: kv[1]):
        # Ignore positions that fall outside the data actually returned
        # (stale entries, or an empty row set): assigning them would raise
        # IndexError.
        if 0 <= position < width:
            cols[position] = name

    #Set the column names guessed at - is there a better way?
    df.columns = cols

    display(df)

#columns = [n for n in p.column_order]

#How do we get the full list of column names?


# In[16]:


show_table(p2)