#!/usr/bin/env python
# coding: utf-8

# # Question Answering with DeepMatcher
#
# Note: you can run **[this notebook live in Google Colab](https://colab.research.google.com/github/anhaidgroup/deepmatcher/blob/master/examples/question_answering.ipynb)**.
#
# DeepMatcher can easily be used for text matching tasks such as Question Answering, Text Entailment, etc. In this tutorial we will see how to use DeepMatcher for Answer Selection, a major sub-task of Question Answering. Specifically, we will look at [WikiQA](https://aclweb.org/anthology/D15-1237), a benchmark dataset for Answer Selection. There are three main steps in this tutorial:
#
# 1. Get data and transform it into DeepMatcher input format
# 2. Set up and train a DeepMatcher model
# 3. Evaluate the model using QA eval metrics
#
# Before we begin, if you are running this notebook in Colab, you will first need to install the necessary packages by running the code below:

# In[ ]:

try:
    import deepmatcher
except ImportError:
    get_ipython().system('pip install -qqq deepmatcher')

# ## Step 1: Get data and transform it into DeepMatcher input format
#
# First let's import the relevant packages and download the dataset:

# In[1]:

import deepmatcher as dm
import pandas as pd
import os

get_ipython().system('wget -qnc https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip')
get_ipython().system('unzip -qn WikiQACorpus.zip')

# Let's see what this dataset looks like:

# In[2]:

raw_train = pd.read_csv(os.path.join('WikiQACorpus', 'WikiQA-train.txt'),
                        sep='\t', header=None)
raw_train.head()

# Clearly, it is not in the format `deepmatcher` expects its input data to be in - this file has no column names, no ID column, and it's not a CSV file. Let's fix that:

# In[3]:

raw_train.columns = ['left_value', 'right_value', 'label']
raw_train.index.name = 'id'
raw_train.head()

# Looks good. Now let's save this to disk and transform the validation and test data in the same way:

# In[4]:

raw_train.to_csv(os.path.join('WikiQACorpus', 'dm_train.csv'))

raw_files = ['WikiQA-dev.txt', 'WikiQA-test.txt']
csv_files = ['dm_valid.csv', 'dm_test.csv']
for i in range(2):
    raw_data = pd.read_csv(os.path.join('WikiQACorpus', raw_files[i]),
                           sep='\t', header=None)
    raw_data.columns = ['left_value', 'right_value', 'label']
    raw_data.index.name = 'id'
    raw_data.to_csv(os.path.join('WikiQACorpus', csv_files[i]))

# ## Step 2: Set up and train DeepMatcher model
#
# Now we are ready to load and process the data for `deepmatcher`:

# In[5]:

train, validation, test = dm.data.process(
    path='WikiQACorpus',
    train='dm_train.csv',
    validation='dm_valid.csv',
    test='dm_test.csv')

# Next, we create a `deepmatcher` model and train it. Note that since this is a demo, we do not perform hyperparameter tuning - we simply use the default settings for everything except the `pos_neg_ratio` parameter. This must be set because there are very few "positive matches" (candidates that correctly answer the question) in this dataset. In a real application setting you should tune the other model hyperparameters as well to get optimal performance.

# In[6]:

model = dm.MatchingModel()
model.run_train(
    train,
    validation,
    epochs=10,
    best_save_path='hybrid_model.pth',
    pos_neg_ratio=7)

# Now that we have a trained model, we obtain its predictions for the test data. Note that `deepmatcher` computes F1, precision and recall by default, but these may not be the most suitable evaluation metrics for your end task. For instance, in Question Answering the more relevant metrics are MAP and MRR, which we will compute in the next step.
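# As a quick informal reminder (these are the standard definitions, matching what the evaluation code below computes): let $Q$ be the set of test questions with at least one correct answer, let $\mathrm{rank}_q$ be the position of the first correct answer in the ranked candidate list for question $q$, and let $\mathrm{AP}(q)$ be the average of precision@$k$ over the ranks $k$ at which correct answers for $q$ appear. Then
#
# $$\mathrm{MRR} = \frac{1}{|Q|} \sum_{q \in Q} \frac{1}{\mathrm{rank}_q}, \qquad \mathrm{MAP} = \frac{1}{|Q|} \sum_{q \in Q} \mathrm{AP}(q)$$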
# In[7]:

predictions = model.run_prediction(test, output_attributes=True)

# ## Step 3: Evaluate model using QA eval metrics
#
# Finally, we compute the Mean Average Precision (MAP) and Mean Reciprocal Rank (MRR) using the model's predictions on the test set. Following the approach of the [paper that introduced this dataset](https://aclweb.org/anthology/D15-1237), questions in the test set without any correct answers are ignored when computing these metrics.

# In[8]:

MAP, MRR = 0, 0
grouped = predictions.groupby('left_value')
num_questions = 0
for question, answers in grouped:
    # Rank the candidate answers for this question by predicted match score.
    sorted_answers = answers.sort_values('match_score', ascending=False)
    p, ap = 0, 0
    top_answer_found = False
    for idx, answer in enumerate(sorted_answers.itertuples()):
        if answer.label == 1:
            if not top_answer_found:
                # Reciprocal rank of the first correct answer.
                MRR += 1 / (idx + 1)
                top_answer_found = True
            p += 1
            # Precision at this rank, accumulated for average precision.
            ap += p / (idx + 1)
    if p > 0:
        # Only questions with at least one correct answer count towards MAP and MRR.
        ap /= p
        num_questions += 1
        MAP += ap

MAP /= num_questions
MRR /= num_questions

print('MAP:', MAP)
print('MRR:', MRR)
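# As a follow-up, here is a minimal sketch of how the trained model could be used to rank candidate answers for new, unlabeled question-answer pairs. This is only a sketch: it assumes a hypothetical file `WikiQACorpus/dm_unlabeled.csv` with the same `id`, `left_value` and `right_value` columns as the files above (but no `label` column); adjust the path to your own data.

# In[ ]:

unlabeled_path = os.path.join('WikiQACorpus', 'dm_unlabeled.csv')  # hypothetical file, not part of WikiQA
if os.path.exists(unlabeled_path):
    # Process the unlabeled candidates with the vocabulary and settings of the trained model,
    # score each question-answer pair, and rank the candidates for each question by match score.
    unlabeled = dm.data.process_unlabeled(path=unlabeled_path, trained_model=model)
    new_predictions = model.run_prediction(unlabeled, output_attributes=True)
    ranked = new_predictions.sort_values(['left_value', 'match_score'],
                                         ascending=[True, False])
    print(ranked.groupby('left_value').head(1))  # top-ranked answer for each question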