#!/usr/bin/env python
# coding: utf-8

# # OpenRefine Client
# 
# Notebook demonstrating how to control OpenRefine via a Python client.
# 
# Use the `dbutlerdb/refine-client-py` fork of `PaulMakepeace/refine-client-py` for Python3 support.
# 
# I'm not yet convinced this is sensible or that the python client is as useable as it might be? Things like `pandas` are perfectly serviceable for working with tabular data in a notebook, so why would we want to use the OpenRefine engine?

# ## Getting Started
# 
# To start, ensure that the OpenRefine application server is running. You can start it from the notebook homepage (`New -> OpenRefine Session`.
# 
# The server connection is looked for on the default port 3333. This can be hardcoded as part of the `nbopenrefineder` OpneRefine start-up command.

# In[1]:


import os

#We might have to spot these in the notebook server log - the terminal output where the notebook server is started
#If we're in MyBinder, that's not much help though?
#os.environ['OPENREFINE_HOST']='127.0.0.1'
#os.environ['OPENREFINE_PORT']='39113'


# In[3]:


from open.refine import refine

#What is the distinction between these two?
#Can we make use of it in a Jupyter context somehow?

server = refine.RefineServer()
orefine = refine.Refine(server)


# We can list any projects that currently exist:

# In[5]:


orefine.list_projects().items()


# ## Creating Projects
# 
# Projects can be created via the API from a source data file. 

# In[255]:


get_ipython().run_cell_magic('capture', '', 'try:\n    import pandas as pd\nexcept:\n    !pip install pandas\n    import pandas as pd\n')


# In[256]:


#Create a simple test data file as a CSV file

#via https://github.com/dbutlerdb/refine-client-py/blob/master/tests/data/duplicates.csv
data = '''email,name,state,gender,purchase
danny.baron@example1.com,Danny Baron,CA,M,TV
melanie.white@example2.edu,Melanie White,NC,F,iPhone
danny.baron@example1.com,D. Baron,CA,M,Winter jacket
ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight
arthur.duff@example4.com,Arthur Duff,OR,M,Dining table
danny.baron@example1.com,Daniel Baron,CA,M,Bike
jean.griffith@example5.org,Jean Griffith,WA,F,Power drill
melanie.white@example2.edu,Melanie White,NC,F,iPad
ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier
arthur.duff@example4.com,Arthur Duff,OR,M,Night table'''


fn = 'test.csv'

with open(fn,'w') as f:
    f.write(data)


# Either copy and paste the above data into a new OpenRefine project as a *clipboard* item or load it in as a CSV data file.
# 
# We should now be able to see the project via the project index list:

# In[257]:


#For some reason, the project does not appear to get named?
orefine.list_projects()


# ## Opening an OpenRefine Project
# 
# To open an OpenRefine project, we need to get it's key value from the project list. (Note that different projects may have the same name.)

# In[258]:


dir(orefine)


# In[259]:


KEY = '2165169842672'  #Need to use a valid key
KEY = list(orefine.list_projects().keys())[0]

p = orefine.open_project(KEY)


# In[260]:


#Inspect the column names
p.columns


# In[261]:


#Rename a column

col = p.columns[-1]
p.rename_column(col,'{}2'.format(col))
p.columns, p.column_order


# *Hmmm, it looks like there's a duplicate there?*

# ### Adding A Column
# 
# OpenRefine allows you to add a column derived from one or more other columns.
# 
# ```
# .add_column(oldColumn, newColumn, expression="value", column_insert_index=None, on_error='set-to-blank'])
# ```
# 
# The `value` keyword denotes using the cell values from the original column.

# In[262]:


p.add_column('name', 'name2', column_insert_index=2)


# *How do we access the history from the API? Or do we have to create one ourselves from command responses?*

# In[263]:


#Add a column based on the name
p.add_column('name', 'NAME', "toUppercase(value)")


# For a list of supported GREL (Google Refine Expression Language) commands, see [here](https://github.com/OpenRefine/OpenRefine/wiki/GREL-String-Functions).

# ### Delete A Column
# 
# The `remove column` OpenRefine option doesn't appear to be supported?

# ## Export CSV
# 
# The export appears to work on the dataset with current filters set. This is distinct from the complere dataset.

# In[264]:


import pandas as pd
from io import StringIO

pd.read_csv( StringIO( p.export(export_format='csv') ) )


# In[265]:


def show_export(p):
    display( pd.read_csv( StringIO( p.export(export_format='csv') ) ) )


# Via https://stackoverflow.com/a/53742983/454773 it looks like there may be an `options` parmamter that allows for column selection. This is not currently supported by the Python client.

# ## Display Table With Current Filters
# 
# We can display the state of the table with current project filters set.

# In[266]:


def show_table(p):
    ''' Display currently selected rows in the table. '''
    
    cells = [ [col['v'] for col in row['cells']] for row in p.get_rows().rows.rows_response ]
    df = pd.DataFrame( cells )
    
    #The list of columns seems to include historical items
    #But how do we also guarantee the current one? dicts are inherently unordered?
    cols = ['Unnamed_{}'.format(i) for i in range(len(df.columns))]
    for (k,v) in sorted(p.column_order.items(), key=lambda kv: kv[1]):
        cols[v]=k
        
    #Set the column names guessed at - is there a better way?
    df.columns = cols
    
    display ( df ) #columns = [n for n in p.column_order]
    
    
#How do we get the full list of column names?


# In[267]:


show_table(p)


# ## Table Manipulation
# 
# How can we manipulate the data table?

# ### Column Names and Column Order
# 
# We can look up the current column names and column order as follows:

# In[268]:


p.column_order


# Rearrange column order:

# In[269]:


p.reorder_columns(['email','name','state','gender','purchase2','NAME','name2'])
show_export(p)


# In[270]:


p.reorder_columns(['email','name'])
show_export(p)


# In[271]:


p.reorder_columns(['email','name','state','gender','purchase2','NAME','name2'])
show_export(p)


# In[272]:


p.columns


# Although the full set of columns aren't in the exportable view of the table, they do still exist:

# In[273]:


p.get_rows().rows.rows_response[0]


# So how do we reset that? We presumably need to unstep the project history?

# ### Sorting
# 
# Data in the table can be sorted on one or more columns, each ascending or descending.

# In[274]:


from open.refine.facet import Sorting


# In[275]:


sorting = Sorting('email')

sorting.criteria


# In[276]:


Sorting(['name', 'gender']).criteria


# In[277]:


p.sorting = Sorting('email')
p.reorder_rows()

show_table(p)


# In[278]:


#Seems we can be more explicit - but are the missing items (e.g. valueType, caseSensitive) important?
Sorting([{'column':'name','reverse':True}]).criteria


# In[279]:


p.sorting = Sorting(['email',{'column':'name','reverse':True}])
p.reorder_rows()

show_table(p)


# In[280]:


p.get_rows().rows.rows_response[0]


# ### Unset filters
# 
# It looks like we can use `.engine.reset_all()` to reset filters.

# In[281]:


p.engine.reset_all()
show_table(p)


# In[282]:


#The export filter still seems to be set though?
show_export(p)


# In[283]:


#Also need an example with numeric type


# ### Project history
# 
# How do we explore a project's history?

# In[284]:


def show_history_item(h):
    return h.time, h.id,h.description

show_history_item(p.history_entry)


# ## Creating an OpenRefine Project
# 
# *This doesn't seem to work?*
# 
# *It would be useful to have a simple recipe for creating a project from a pandas dataframe.*

# In[285]:


import os

#Create an OpenRefine project from the data file
#Use the absolute path to the file
p2 = orefine.new_project(project_file=os.path.abspath(fn),
                      project_name='Test 1',
                      project_file_name=fn)

#Do we have to mediate this via a file transfer? eg could we go more directly from a pandas dataframe somehow?


# In[286]:


#For some reason, the project does not appear to get named?
#There also can be a delay before the index listing shows that the data has been loaded?
orefine.list_projects()


# ### Delete a Project
# 
# Take care with this one — there is no warning...

# In[287]:


p2.delete()


# In[288]:


dir(refine.RefineProject)


# ## Facets and Clustering
# 
# OpenRefine supports facetting on a column, including facet counts, and several clustering algorithms.

# ### Facets

# In[289]:


from open.refine import facet

fr=p.compute_facets(facet.TextFacet('name'))


# In[290]:


facets = fr.facets[0]
for k in sorted(facets.choices,
                key=lambda k: facets.choices[k].count,
                reverse=True):
    print(facets.choices[k].count, k)


# In[293]:


facet_value = facet.TextFacet(column='email',
                              selection=['danny.baron@example1.com','arthur.duff@example4.com'])
facet_value.as_dict()


# In[294]:


#The export doesn't appear to be affected?
p.engine.add_facet(facet_value)
p.compute_facets()

show(p)


# In[295]:


#But the table view is?
show_table(p)


# In[296]:


#Reset the facet filter
facet_value.reset()
facet_value.as_dict()


# In[300]:


show_table(p)


# ### Clustering
# 
# Open Refine supports several clustering methods:
# 
# - `clusterer_type: binning; refine_function: fingerprint|metaphone3|cologne-phonetic`
# - `clusterer_type: binning; refine_function: ngram-fingerprint; params: {'ngram-size': INT}`
# - `clusterer_type: knn; refine_function: levenshtein|ppm; params: {'radius': FLOAT,'blocking-ngram-size': INT}`

# In[291]:


clusters=p.compute_clusters('name',
                            clusterer_type='binning',
                            refine_function='cologne-phonetic')
for cluster in clusters:
    print(cluster)


# In[292]:


clusters=p.compute_clusters('name',
                            clusterer_type='knn',
                            refine_function='levenshtein',
                            params={'radius':3})
for cluster in clusters:
    print(cluster)


# So how do we then select and apply a correction amongst clustered values?

# ## Reconciliation
# 
# Reconciliation allows column values to be "reconciled" with data from third party services.

# In[19]:


data='''Name
Diane Abbott
Boris Johnson
Boris Johnstone
Diana Abbot
Boris Johnston
Joanna Lumley
Boris Johnstone
'''

fn = 'test2.csv'

with open(fn,'w') as f:
    f.write(data)
    
p=orefine.new_project(project_file=os.path.abspath(fn),
                      project_name='Test 2',
                      project_file_name=fn)


# In[24]:


p.get_reconciliation_services()


# In[25]:


p.get_reconciliation_service_by_name_or_url('Wikidata (en)')


# In[26]:


#Get suggested types
p.guess_types_of_column('Name',
                        'https://tools.wmflabs.org/openrefine-wikidata/en/api')

reconciliation_config = {
                'mode': 'standard-service',
                'service': service['url'],
                'identifierSpace': service['identifierSpace'],
                'schemaSpace': service['schemaSpace'],
                'type': {
                    'id': reconciliation_type['id'],
                    'name': reconciliation_type['name'],
                },
                'autoMatch': True,
                'columnDetails': [],
            }p.reconcile(column='Name',
            service='Wikidata (en)',
            reconciliation_type={'id': 'Q5', 'name': 'human'})
#Throws an error looking for 'server_url' rather than 'url' in get_reconciliation_service_by_name_or_url ?
# In[27]:


reconciliation_config = {
                'mode': 'standard-service',
                'service': 'https://tools.wmflabs.org/openrefine-wikidata/en/preview?id={{id}}',
                'identifierSpace': 'http://www.wikidata.org/entity/',
                'schemaSpace': 'http://www.wikidata.org/prop/direct/',
                'type': {
                    'id': 'Q5',
                    'name': 'human',
                },
                'autoMatch': True,
                'columnDetails': [],
            }


# In[ ]:


p.reconcile(column='Name',
            service='Wikidata (en)',
            reconciliation_config=reconciliation_config)


# Reconciliation is asynchronous - how do we check the actual response when it arrives?
# 
# Code suggests call `wait_until_idle()`, but where? Perhaps requires a fork in the code to embed it in `refine.py::RefineProject().reconcile()` prior to providing the response?

# In[ ]:


pd.read_csv( StringIO( p.export(export_format='csv') ) )


# See also: [OpenRefine Style Reconciliation Containers](https://blog.ouseful.info/2015/02/02/openrefine-style-reconciliation-containers/). It would be good to weave reconciliation service recipes into Binderised demos, perhaps even using a [Jupyter kernel gateway powered API](https://blog.ouseful.info/2017/09/06/building-a-json-api-using-jupyer-notebooks-in-under-5-minutes/) to allow reconciliation service definitions via a notebook?

#  

#  
# 

#  ---

# ### Debug strategies
# 
# I need to figure some recipes for debugging in notebooks... Indeed, debugging anywhere!

# In[ ]:


#%%debug
import pdb; pdb.set_trace()
p.guess_types_of_column('Name', 'Wikidata (en)')
#c


# In[ ]:


pdb.pm()
#locals()['s']
#q