#!/usr/bin/env python # coding: utf-8 # # OpenRefine Client # # Notebook demonstrating how to control OpenRefine via a Python client. # # Use the `dbutlerdb/refine-client-py` fork of `PaulMakepeace/refine-client-py` for Python3 support. # # I'm not yet convinced this is sensible or that the python client is as useable as it might be? Things like `pandas` are perfectly serviceable for working with tabular data in a notebook, so why would we want to use the OpenRefine engine? # ## Getting Started # # To start, ensure that the OpenRefine application server is running. You can start it from the notebook homepage (`New -> OpenRefine Session`. # # The server connection is looked for on the default port 3333. This can be hardcoded as part of the `nbopenrefineder` OpneRefine start-up command. # In[1]: import os #We might have to spot these in the notebook server log - the terminal output where the notebook server is started #If we're in MyBinder, that's not much help though? #os.environ['OPENREFINE_HOST']='127.0.0.1' #os.environ['OPENREFINE_PORT']='39113' # In[3]: from open.refine import refine #What is the distinction between these two? #Can we make use of it in a Jupyter context somehow? server = refine.RefineServer() orefine = refine.Refine(server) # We can list any projects that currently exist: # In[5]: orefine.list_projects().items() # ## Creating Projects # # Projects can be created via the API from a source data file. # In[255]: get_ipython().run_cell_magic('capture', '', 'try:\n import pandas as pd\nexcept:\n !pip install pandas\n import pandas as pd\n') # In[256]: #Create a simple test data file as a CSV file #via https://github.com/dbutlerdb/refine-client-py/blob/master/tests/data/duplicates.csv data = '''email,name,state,gender,purchase danny.baron@example1.com,Danny Baron,CA,M,TV melanie.white@example2.edu,Melanie White,NC,F,iPhone danny.baron@example1.com,D. Baron,CA,M,Winter jacket ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight arthur.duff@example4.com,Arthur Duff,OR,M,Dining table danny.baron@example1.com,Daniel Baron,CA,M,Bike jean.griffith@example5.org,Jean Griffith,WA,F,Power drill melanie.white@example2.edu,Melanie White,NC,F,iPad ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier arthur.duff@example4.com,Arthur Duff,OR,M,Night table''' fn = 'test.csv' with open(fn,'w') as f: f.write(data) # Either copy and paste the above data into a new OpenRefine project as a *clipboard* item or load it in as a CSV data file. # # We should now be able to see the project via the project index list: # In[257]: #For some reason, the project does not appear to get named? orefine.list_projects() # ## Opening an OpenRefine Project # # To open an OpenRefine project, we need to get it's key value from the project list. (Note that different projects may have the same name.) # In[258]: dir(orefine) # In[259]: KEY = '2165169842672' #Need to use a valid key KEY = list(orefine.list_projects().keys())[0] p = orefine.open_project(KEY) # In[260]: #Inspect the column names p.columns # In[261]: #Rename a column col = p.columns[-1] p.rename_column(col,'{}2'.format(col)) p.columns, p.column_order # *Hmmm, it looks like there's a duplicate there?* # ### Adding A Column # # OpenRefine allows you to add a column derived from one or more other columns. # # ``` # .add_column(oldColumn, newColumn, expression="value", column_insert_index=None, on_error='set-to-blank']) # ``` # # The `value` keyword denotes using the cell values from the original column. # In[262]: p.add_column('name', 'name2', column_insert_index=2) # *How do we access the history from the API? Or do we have to create one ourselves from command responses?* # In[263]: #Add a column based on the name p.add_column('name', 'NAME', "toUppercase(value)") # For a list of supported GREL (Google Refine Expression Language) commands, see [here](https://github.com/OpenRefine/OpenRefine/wiki/GREL-String-Functions). # ### Delete A Column # # The `remove column` OpenRefine option doesn't appear to be supported? # ## Export CSV # # The export appears to work on the dataset with current filters set. This is distinct from the complere dataset. # In[264]: import pandas as pd from io import StringIO pd.read_csv( StringIO( p.export(export_format='csv') ) ) # In[265]: def show_export(p): display( pd.read_csv( StringIO( p.export(export_format='csv') ) ) ) # Via https://stackoverflow.com/a/53742983/454773 it looks like there may be an `options` parmamter that allows for column selection. This is not currently supported by the Python client. # ## Display Table With Current Filters # # We can display the state of the table with current project filters set. # In[266]: def show_table(p): ''' Display currently selected rows in the table. ''' cells = [ [col['v'] for col in row['cells']] for row in p.get_rows().rows.rows_response ] df = pd.DataFrame( cells ) #The list of columns seems to include historical items #But how do we also guarantee the current one? dicts are inherently unordered? cols = ['Unnamed_{}'.format(i) for i in range(len(df.columns))] for (k,v) in sorted(p.column_order.items(), key=lambda kv: kv[1]): cols[v]=k #Set the column names guessed at - is there a better way? df.columns = cols display ( df ) #columns = [n for n in p.column_order] #How do we get the full list of column names? # In[267]: show_table(p) # ## Table Manipulation # # How can we manipulate the data table? # ### Column Names and Column Order # # We can look up the current column names and column order as follows: # In[268]: p.column_order # Rearrange column order: # In[269]: p.reorder_columns(['email','name','state','gender','purchase2','NAME','name2']) show_export(p) # In[270]: p.reorder_columns(['email','name']) show_export(p) # In[271]: p.reorder_columns(['email','name','state','gender','purchase2','NAME','name2']) show_export(p) # In[272]: p.columns # Although the full set of columns aren't in the exportable view of the table, they do still exist: # In[273]: p.get_rows().rows.rows_response[0] # So how do we reset that? We presumably need to unstep the project history? # ### Sorting # # Data in the table can be sorted on one or more columns, each ascending or descending. # In[274]: from open.refine.facet import Sorting # In[275]: sorting = Sorting('email') sorting.criteria # In[276]: Sorting(['name', 'gender']).criteria # In[277]: p.sorting = Sorting('email') p.reorder_rows() show_table(p) # In[278]: #Seems we can be more explicit - but are the missing items (e.g. valueType, caseSensitive) important? Sorting([{'column':'name','reverse':True}]).criteria # In[279]: p.sorting = Sorting(['email',{'column':'name','reverse':True}]) p.reorder_rows() show_table(p) # In[280]: p.get_rows().rows.rows_response[0] # ### Unset filters # # It looks like we can use `.engine.reset_all()` to reset filters. # In[281]: p.engine.reset_all() show_table(p) # In[282]: #The export filter still seems to be set though? show_export(p) # In[283]: #Also need an example with numeric type # ### Project history # # How do we explore a project's history? # In[284]: def show_history_item(h): return h.time, h.id,h.description show_history_item(p.history_entry) # ## Creating an OpenRefine Project # # *This doesn't seem to work?* # # *It would be useful to have a simple recipe for creating a project from a pandas dataframe.* # In[285]: import os #Create an OpenRefine project from the data file #Use the absolute path to the file p2 = orefine.new_project(project_file=os.path.abspath(fn), project_name='Test 1', project_file_name=fn) #Do we have to mediate this via a file transfer? eg could we go more directly from a pandas dataframe somehow? # In[286]: #For some reason, the project does not appear to get named? #There also can be a delay before the index listing shows that the data has been loaded? orefine.list_projects() # ### Delete a Project # # Take care with this one — there is no warning... # In[287]: p2.delete() # In[288]: dir(refine.RefineProject) # ## Facets and Clustering # # OpenRefine supports facetting on a column, including facet counts, and several clustering algorithms. # ### Facets # In[289]: from open.refine import facet fr=p.compute_facets(facet.TextFacet('name')) # In[290]: facets = fr.facets[0] for k in sorted(facets.choices, key=lambda k: facets.choices[k].count, reverse=True): print(facets.choices[k].count, k) # In[293]: facet_value = facet.TextFacet(column='email', selection=['danny.baron@example1.com','arthur.duff@example4.com']) facet_value.as_dict() # In[294]: #The export doesn't appear to be affected? p.engine.add_facet(facet_value) p.compute_facets() show(p) # In[295]: #But the table view is? show_table(p) # In[296]: #Reset the facet filter facet_value.reset() facet_value.as_dict() # In[300]: show_table(p) # ### Clustering # # Open Refine supports several clustering methods: # # - `clusterer_type: binning; refine_function: fingerprint|metaphone3|cologne-phonetic` # - `clusterer_type: binning; refine_function: ngram-fingerprint; params: {'ngram-size': INT}` # - `clusterer_type: knn; refine_function: levenshtein|ppm; params: {'radius': FLOAT,'blocking-ngram-size': INT}` # In[291]: clusters=p.compute_clusters('name', clusterer_type='binning', refine_function='cologne-phonetic') for cluster in clusters: print(cluster) # In[292]: clusters=p.compute_clusters('name', clusterer_type='knn', refine_function='levenshtein', params={'radius':3}) for cluster in clusters: print(cluster) # So how do we then select and apply a correction amongst clustered values? # ## Reconciliation # # Reconciliation allows column values to be "reconciled" with data from third party services. # In[19]: data='''Name Diane Abbott Boris Johnson Boris Johnstone Diana Abbot Boris Johnston Joanna Lumley Boris Johnstone ''' fn = 'test2.csv' with open(fn,'w') as f: f.write(data) p=orefine.new_project(project_file=os.path.abspath(fn), project_name='Test 2', project_file_name=fn) # In[24]: p.get_reconciliation_services() # In[25]: p.get_reconciliation_service_by_name_or_url('Wikidata (en)') # In[26]: #Get suggested types p.guess_types_of_column('Name', 'https://tools.wmflabs.org/openrefine-wikidata/en/api') reconciliation_config = { 'mode': 'standard-service', 'service': service['url'], 'identifierSpace': service['identifierSpace'], 'schemaSpace': service['schemaSpace'], 'type': { 'id': reconciliation_type['id'], 'name': reconciliation_type['name'], }, 'autoMatch': True, 'columnDetails': [], }p.reconcile(column='Name', service='Wikidata (en)', reconciliation_type={'id': 'Q5', 'name': 'human'}) #Throws an error looking for 'server_url' rather than 'url' in get_reconciliation_service_by_name_or_url ? # In[27]: reconciliation_config = { 'mode': 'standard-service', 'service': 'https://tools.wmflabs.org/openrefine-wikidata/en/preview?id={{id}}', 'identifierSpace': 'http://www.wikidata.org/entity/', 'schemaSpace': 'http://www.wikidata.org/prop/direct/', 'type': { 'id': 'Q5', 'name': 'human', }, 'autoMatch': True, 'columnDetails': [], } # In[ ]: p.reconcile(column='Name', service='Wikidata (en)', reconciliation_config=reconciliation_config) # Reconciliation is asynchronous - how do we check the actual response when it arrives? # # Code suggests call `wait_until_idle()`, but where? Perhaps requires a fork in the code to embed it in `refine.py::RefineProject().reconcile()` prior to providing the response? # In[ ]: pd.read_csv( StringIO( p.export(export_format='csv') ) ) # See also: [OpenRefine Style Reconciliation Containers](https://blog.ouseful.info/2015/02/02/openrefine-style-reconciliation-containers/). It would be good to weave reconciliation service recipes into Binderised demos, perhaps even using a [Jupyter kernel gateway powered API](https://blog.ouseful.info/2017/09/06/building-a-json-api-using-jupyer-notebooks-in-under-5-minutes/) to allow reconciliation service definitions via a notebook? # # # # --- # ### Debug strategies # # I need to figure some recipes for debugging in notebooks... Indeed, debugging anywhere! # In[ ]: #%%debug import pdb; pdb.set_trace() p.guess_types_of_column('Name', 'Wikidata (en)') #c # In[ ]: pdb.pm() #locals()['s'] #q