#!/usr/bin/env python
# coding: utf-8

# # Exercise 1: Pseudo-feedback with Two-component Mixture Model

# First, let's import the Python bindings for MeTA:

# In[1]:

import metapy

# If you don't have `metapy` installed, you can install it with a
#
# ```bash
# pip install metapy
# ```
#
# on the command line on Linux, macOS, or Windows for either Python 2.7 or Python 3.x. (I will be using Python 3.6 in this tutorial.)

# Double-check that you are running the latest version. Right now, that should be `0.2.10`.

# In[2]:

metapy.__version__

# Now, let's set MeTA to log to standard error so we can see progress output for long-running commands. (Only do this once, or you'll get doubled output.)

# In[3]:

metapy.log_to_stderr()

# Now, let's download all of the files we need for the tutorial.

# In[4]:

import urllib.request
import os
import tarfile

if not os.path.exists('sigir18-tutorial.tar.gz'):
    urllib.request.urlretrieve('https://meta-toolkit.org/data/2018-06-25/sigir18-tutorial.tar.gz',
                               'sigir18-tutorial.tar.gz')

if not os.path.exists('data'):
    with tarfile.open('sigir18-tutorial.tar.gz', 'r:gz') as files:
        files.extractall()

# Let's index our data using the `InvertedIndex` format. In a search engine, we want to quickly determine which documents mention a specific query term, so the `InvertedIndex` stores a mapping from each term to the list of documents that contain it (along with how many times they do).

# In[5]:

inv_idx = metapy.index.make_inverted_index('cranfield.toml')

# The first run may take a minute, since the index needs to be built. Subsequent calls to `make_inverted_index` with this config file will simply load the existing index, which is nearly instantaneous.
#
# Here's how we can interact with the index object:

# In[6]:

inv_idx.num_docs()

# In[7]:

inv_idx.unique_terms()

# In[8]:

inv_idx.avg_doc_length()

# In[9]:

inv_idx.total_corpus_terms()

# Let's search our index. We'll start by creating a ranker:

# In[10]:

ranker = metapy.index.DirichletPrior()

# Now we need a query. Let's create an example query.

# In[11]:

query = metapy.index.Document()
query.content("flow equilibrium")

# Now we can use this to search our index like so:

# In[12]:

top_docs = ranker.score(inv_idx, query, num_results=5)
top_docs

# We are returned a ranked list of *(doc_id, score)* pairs. The scores come from the ranker, which in this case is `DirichletPrior` (query likelihood with Dirichlet prior smoothing). Since the `cranfield.toml` config file we created for the Cranfield dataset has `store-full-text = true`, we can verify the content of our top documents by inspecting the document metadata field "content".

# In[13]:

for num, (d_id, _) in enumerate(top_docs):
    content = inv_idx.metadata(d_id).get('content')
    print("{}. {}...\n".format(num + 1, content[0:250]))

# Since we have the queries file and relevance judgements, we can run an IR evaluation.

# In[14]:

ev = metapy.index.IREval('cranfield.toml')

# We will loop over the queries file and add each result to the `IREval` object `ev`.

# In[15]:

def evaluate_ranker(ranker, ev, num_results):
    ev.reset_stats()
    query = metapy.index.Document()  # fresh query document for this run
    with open('data/cranfield/cranfield-queries.txt') as query_file:
        for query_num, line in enumerate(query_file):
            query.content(line.strip())
            results = ranker.score(inv_idx, query, num_results)
            # Cranfield query ids are 1-based, hence query_num + 1
            avg_p = ev.avg_p(results, query_num + 1, num_results)
            print("Query {} average precision: {}".format(query_num + 1, avg_p))

evaluate_ranker(ranker, ev, 10)

# Afterwards, we can get the mean average precision of all the queries.
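# As a quick refresher before we compute it: for a single query, average precision is the mean of the precision values at the ranks where relevant documents appear, normalized by the total number of relevant documents, and MAP averages this over all queries. The cell below is a minimal sketch of that arithmetic done by hand; the ranked list and relevance set are hypothetical, not taken from the Cranfield judgements.

# In[ ]:

ranked = [12, 7, 41, 3, 19]  # hypothetical doc ids in ranked order
relevant = {7, 3, 25}        # hypothetical judged-relevant doc ids

hits, precision_sum = 0, 0.0
for rank, doc_id in enumerate(ranked, start=1):
    if doc_id in relevant:
        hits += 1
        precision_sum += hits / rank  # precision at this relevant hit
avg_p_by_hand = precision_sum / len(relevant)  # normalize by total relevant
print(avg_p_by_hand)  # MAP is this quantity averaged over all queries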
# In[16]:

dp_map = ev.map()
print("MAP: {}".format(dp_map))

# Now, let's use the two-component mixture model we discussed as an implementation of pseudo-feedback for retrieval and see whether it improves performance. The ranking function here is KL-divergence, where the query model is adjusted to incorporate pseudo-feedback from the top retrieved documents: the feedback documents are modeled as a two-component mixture of an unknown topic model $\theta_F$ and the background collection model $C$,
#
# $$p(w) = (1 - \lambda)\, p(w \mid \theta_F) + \lambda\, p(w \mid C),$$
#
# where $\theta_F$ is estimated with EM, and the new query model interpolates the original query model with it: $\theta_Q' = (1 - \alpha)\, \theta_Q + \alpha\, \theta_F$.
#
# In order to work, the ranker needs to quickly determine which words occur in the feedback document set. The `InvertedIndex` does not provide fast access to this (it maps from terms to documents, rather than from documents to terms), so we will first create a `ForwardIndex` to get the document-to-terms mapping.

# In[17]:

fwd_idx = metapy.index.make_forward_index('cranfield.toml')

# Now we can construct the KL-divergence pseudo-feedback ranker. The main components are:
#
# 1. The forward index
# 2. A base language-model ranker (here we'll use `DirichletPrior`)
# 3. $\alpha$, the query interpolation parameter (how strongly do we prefer terms from the feedback model? default 0.5)
# 4. $\lambda$, the language-model interpolation parameter (how strong is the background model in the two-component mixture? default 0.5)
# 5. $k$, the number of documents to retrieve for the feedback set (default 10)
# 6. `max_terms`, the number of terms from the feedback model to incorporate into the new query model (default 50)
#
# We'll use the defaults here; a sketch with explicit parameter values appears at the end of this notebook.

# In[18]:

feedback = metapy.index.KLDivergencePRF(fwd_idx, metapy.index.DirichletPrior())

# In[19]:

evaluate_ranker(feedback, ev, 10)

# In[20]:

fb_map = ev.map()
print("Feedback MAP: {}".format(fb_map))
print("DP MAP: {}".format(dp_map))
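# As promised above, here is a sketch of constructing the feedback ranker with explicit parameter values instead of the defaults. The positional argument order follows the component list above; the exact constructor signature may differ between metapy versions, so check `help(metapy.index.KLDivergencePRF)` before relying on it, and treat the values below as illustrative starting points rather than tuned settings.

# In[ ]:

tuned = metapy.index.KLDivergencePRF(
    fwd_idx,
    metapy.index.DirichletPrior(),
    0.7,  # alpha: weight the query model more heavily toward feedback terms
    0.5,  # lambda: background-model weight in the two-component mixture
    10,   # k: size of the pseudo-feedback document set
    50    # max_terms: feedback terms folded into the new query model
)
evaluate_ranker(tuned, ev, 10)
print("Tuned feedback MAP: {}".format(ev.map()))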