#!/usr/bin/env python
# coding: utf-8

# # Exercise 1: Pseudo-feedback with Two-component Mixture Model

# First, let's import the Python bindings for MeTA:

# In[1]:

import metapy

# If you don't have `metapy` installed, you can install it with a
#
# ```bash
# pip install metapy
# ```
#
# on the command line on Linux, macOS, or Windows for either Python 2.7 or Python 3.x. (I will be using Python 3.6 in this tutorial.)

# Double-check that you are running the latest version. Right now, that should be `0.2.10`.

# In[2]:

metapy.__version__

# Now, let's set MeTA to log to standard error so we can see progress output for long-running commands. (Only do this once, or you'll get doubled output.)

# In[3]:

metapy.log_to_stderr()

# Now, let's download all of the files we need for the tutorial.

# In[4]:

import urllib.request
import os
import tarfile

if not os.path.exists('sigir18-tutorial.tar.gz'):
    urllib.request.urlretrieve('https://meta-toolkit.org/data/2018-06-25/sigir18-tutorial.tar.gz',
                               'sigir18-tutorial.tar.gz')

if not os.path.exists('data'):
    with tarfile.open('sigir18-tutorial.tar.gz', 'r:gz') as files:
        files.extractall()

# Let's index our data using the `InvertedIndex` format. In a search engine, we want to quickly determine which documents mention a specific query term, so the `InvertedIndex` stores a mapping from each term to the list of documents that contain it (along with how many times they do).

# In[5]:

inv_idx = metapy.index.make_inverted_index('cranfield.toml')

# The first run may take a minute, since the index needs to be built. Subsequent calls to `make_inverted_index` with this config file will simply load the existing index, which is nearly instantaneous.
#
# Here's how we can interact with the index object:

# In[6]:

inv_idx.num_docs()

# In[7]:

inv_idx.unique_terms()

# In[8]:

inv_idx.avg_doc_length()

# In[9]:

inv_idx.total_corpus_terms()

# Let's search our index. We'll start by creating a ranker:

# In[10]:

ranker = metapy.index.DirichletPrior()

# Now we need a query. Let's create an example query.

# In[11]:

query = metapy.index.Document()
query.content("flow equilibrium")

# Now we can use this to search our index like so:

# In[12]:

top_docs = ranker.score(inv_idx, query, num_results=5)
top_docs

# We are returned a ranked list of *(doc_id, score)* pairs. The scores come from the ranker, which in this case is `DirichletPrior` (query likelihood with Dirichlet prior smoothing). Since the `cranfield.toml` config file we created for the Cranfield dataset has `store-full-text = true`, we can verify the content of our top documents by inspecting the document metadata field "content".

# In[13]:

for num, (d_id, _) in enumerate(top_docs):
    content = inv_idx.metadata(d_id).get('content')
    print("{}. {}...\n".format(num + 1, content[0:250]))

# Since we have the queries file and relevance judgements, we can run an IR evaluation.

# In[14]:

ev = metapy.index.IREval('cranfield.toml')

# We will loop over the queries file and add each result to the `IREval` object `ev`.

# In[15]:

def evaluate_ranker(ranker, ev, num_results):
    ev.reset_stats()
    query = metapy.index.Document()  # fresh query document for this run
    with open('data/cranfield/cranfield-queries.txt') as query_file:
        for query_num, line in enumerate(query_file):
            query.content(line.strip())
            results = ranker.score(inv_idx, query, num_results)
            # Cranfield query ids are 1-based, hence query_num + 1
            avg_p = ev.avg_p(results, query_num + 1, num_results)
            print("Query {} average precision: {}".format(query_num + 1, avg_p))

evaluate_ranker(ranker, ev, 10)

# Afterwards, we can get the mean average precision of all the queries.
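# As a quick refresher before we compute it: for a single query, average precision is the mean of the precision values at the ranks where relevant documents appear, normalized by the total number of relevant documents, and MAP averages this over all queries. The cell below is a minimal sketch of that arithmetic done by hand; the ranked list and relevance set are hypothetical, not taken from the Cranfield judgements.

# In[ ]:

ranked = [12, 7, 41, 3, 19]  # hypothetical doc ids in ranked order
relevant = {7, 3, 25}        # hypothetical judged-relevant doc ids

hits, precision_sum = 0, 0.0
for rank, doc_id in enumerate(ranked, start=1):
    if doc_id in relevant:
        hits += 1
        precision_sum += hits / rank  # precision at this relevant hit
avg_p_by_hand = precision_sum / len(relevant)  # normalize by total relevant
print(avg_p_by_hand)  # MAP is this quantity averaged over all queries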
# In[16]:

dp_map = ev.map()
print("MAP: {}".format(dp_map))

# Now, let's use the two-component mixture model we discussed as an implementation of pseudo-feedback for retrieval and see whether it improves performance. The ranking function here is KL-divergence, where the query model is adjusted to incorporate pseudo-feedback from the top retrieved documents: the feedback documents are modeled as a two-component mixture of an unknown topic model $\theta_F$ and the background collection model $C$,
#
# $$p(w) = (1 - \lambda)\, p(w \mid \theta_F) + \lambda\, p(w \mid C),$$
#
# where $\theta_F$ is estimated with EM, and the new query model interpolates the original query model with it: $\theta_Q' = (1 - \alpha)\, \theta_Q + \alpha\, \theta_F$.
#
# In order to work, the ranker needs to quickly determine which words occur in the feedback document set. The `InvertedIndex` does not provide fast access to this (it maps from terms to documents, rather than from documents to terms), so we will first create a `ForwardIndex` to get the document-to-terms mapping.

# In[17]:

fwd_idx = metapy.index.make_forward_index('cranfield.toml')

# Now we can construct the KL-divergence pseudo-feedback ranker. The main components are:
#
# 1. The forward index
# 2. A base language-model ranker (here we'll use `DirichletPrior`)
# 3. $\alpha$, the query interpolation parameter (how strongly do we prefer terms from the feedback model? default 0.5)
# 4. $\lambda$, the language-model interpolation parameter (how strong is the background model in the two-component mixture? default 0.5)
# 5. $k$, the number of documents to retrieve for the feedback set (default 10)
# 6. `max_terms`, the number of terms from the feedback model to incorporate into the new query model (default 50)
#
# We'll use the defaults here; a sketch with explicit parameter values appears at the end of this notebook.

# In[18]:

feedback = metapy.index.KLDivergencePRF(fwd_idx, metapy.index.DirichletPrior())

# In[19]:

evaluate_ranker(feedback, ev, 10)

# In[20]:

fb_map = ev.map()
print("Feedback MAP: {}".format(fb_map))
print("DP MAP: {}".format(dp_map))
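# As promised above, here is a sketch of constructing the feedback ranker with explicit parameter values instead of the defaults. The positional argument order follows the component list above; the exact constructor signature may differ between metapy versions, so check `help(metapy.index.KLDivergencePRF)` before relying on it, and treat the values below as illustrative starting points rather than tuned settings.

# In[ ]:

tuned = metapy.index.KLDivergencePRF(
    fwd_idx,
    metapy.index.DirichletPrior(),
    0.7,  # alpha: weight the query model more heavily toward feedback terms
    0.5,  # lambda: background-model weight in the two-component mixture
    10,   # k: size of the pseudo-feedback document set
    50    # max_terms: feedback terms folded into the new query model
)
evaluate_ranker(tuned, ev, 10)
print("Tuned feedback MAP: {}".format(ev.map()))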