#!/usr/bin/env python
# coding: utf-8

# # Winning Jeopardy
#
# Jeopardy is a popular TV show in the US where participants answer questions to win money. It's been running for a few decades and is a major force in popular culture.

# The dataset is named jeopardy.csv, and contains 20000 rows from the beginning of a full dataset of Jeopardy questions, which can be downloaded from [here](https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file).

# Data Dictionary:
#
# > Show Number - the Jeopardy episode number of the show this question was in.
# > Air Date - the date the episode aired.
# > Round - the round of Jeopardy that the question was asked in. Jeopardy has several rounds as each episode progresses.
# > Category - the category of the question.
# > Value - the number of dollars answering the question correctly is worth.
# > Question - the text of the question.
# > Answer - the text of the answer.
# ### Aim
#
# **Let's say we want to compete on Jeopardy, and we're looking for any edge we can get to win. In this project, we'll work with a dataset of Jeopardy questions to figure out some patterns in the questions that could help us win.**

# ### Introduction
#
# - We will extract the data into a pandas dataframe.
# - Clean the dataset by dropping rows with null values.
# - Clean the column names.

# In[1]:

import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.dropna(inplace=True)
jeopardy.head()

# In[2]:

jeopardy.columns

# In[3]:

# Removing leading/trailing spaces from column names
jeopardy.columns = [x.strip() for x in jeopardy.columns]
jeopardy.columns

# In[4]:

jeopardy.info()

# ### Normalizing Text
#
# Before we begin the analysis, we need to normalize all of the text columns (the `Question` and `Answer` columns). The idea is to lowercase words and remove punctuation so `Don't` and `don't` aren't considered to be different words when comparing them.

# In[5]:

import re

def normalizing_string(string):
    # Lowercase and strip everything except letters, digits, and whitespace
    string = string.lower()
    string = re.sub(r"[^A-Za-z0-9\s]", "", string)
    return string

# In[6]:

jeopardy['clean_question'] = jeopardy['Question'].apply(normalizing_string)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalizing_string)
jeopardy.head()

# ### Normalizing Columns
#
# There are some other columns to normalize.
#
# The `Value` column should be numeric so we can manipulate it more easily. We'll need to remove the dollar sign from the beginning of each value and convert the column from text to numeric.
#
# The `Air Date` column should also be a datetime, not a string, so we can work with it more easily.

# In[7]:

def normalizing_values(value):
    # Strip "$" and "," then convert to int; non-numeric values (e.g. "None") become 0
    value = re.sub(r"[^A-Za-z0-9\s]", "", value)
    try:
        value = int(value)
    except Exception:
        value = 0
    return value

# In[8]:

jeopardy['clean_value'] = jeopardy['Value'].apply(normalizing_values)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
print(jeopardy.dtypes)
jeopardy.head()
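# As a quick sanity check, we can run both normalizers on a few made-up inputs (these sample strings are hypothetical, not rows from the dataset):

# In[ ]:

# Hypothetical inputs to confirm the cleaning behaves as expected
print(normalizing_string("Don't Panic!"))  # -> "dont panic"
print(normalizing_values("$2,000"))        # -> 2000
print(normalizing_values("None"))          # -> 0 (non-numeric values fall back to 0)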
# In order to figure out whether to study past questions, study general knowledge, or not study at all, it would be helpful to figure out two things:
#
# **How often the answer is deducible from the question.
# How often new questions are repeats of older questions.**
#
# We can answer the second question by seeing how often complex words (six or more characters) reoccur. We can answer the first question by seeing how many times words in the answer also occur in the question.
#
# We'll work on the first question now, and come back to the second.
#
# ---
#
# ### How often the answer is deducible from the question

# In[15]:

def function_ans_in_ques(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0
    # "the" is too common to count as a meaningful match
    try:
        split_answer.remove('the')
    except ValueError:
        pass
    if len(split_answer) == 0:
        return 0
    for element in split_answer:
        if element in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy['answer_in_question'] = jeopardy.apply(function_ans_in_ques, axis=1)
jeopardy['answer_in_question'].mean()

# In[16]:

jeopardy[jeopardy['answer_in_question'] != 0][['clean_question', 'clean_answer']].head()

# *The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.*
#
# ---
#
# ### How often new questions are repeats of older questions

# In[17]:

overlap_ratio = []
terms_repeated_in_ques = []
terms_repeated_overall = set()
terms_used = set()

for i, rows in jeopardy.iterrows():
    split_question = rows['clean_question'].split(" ")
    terms = [x for x in split_question if len(x) > 5]  # words more than 5 letters long
    temp = []
    match_count = 0
    for word in terms:
        if word in terms_used:
            match_count += 1  # increase match count if the word was already seen earlier
            terms_repeated_overall.add(word)  # add word to the repeated-words set (smaller than the used-words set)
            temp.append(word)  # collect the word for the repeat-words column in the dataframe
        terms_used.add(word)  # add word to the used-words set
    if len(terms) > 0:
        match_count /= len(terms)
    overlap_ratio.append(match_count)
    terms_repeated_in_ques.append(temp)

jeopardy['overlap_ratio'] = overlap_ratio
jeopardy['overlap_terms'] = terms_repeated_in_ques
jeopardy['overlap_ratio'].mean()

# *There is an 87% overlap of words between new questions and old ones. However, words can be put together as different phrases with a big difference in meaning, so this huge overlap is not super significant.*
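# As a rough check of that caveat, we could repeat the measurement on two-word phrases (bigrams) instead of single words; a much lower overlap would suggest the single-word figure is inflated. A minimal sketch (our own addition, not part of the original analysis), assuming `clean_question` from above:

# In[ ]:

bigrams_seen = set()
bigram_overlap = []

for question in jeopardy['clean_question']:
    words = question.split()
    bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
    # Count bigrams already seen in earlier questions, then record this question's bigrams
    matches = sum(1 for b in bigrams if b in bigrams_seen)
    bigrams_seen.update(bigrams)
    if bigrams:
        bigram_overlap.append(matches / len(bigrams))

sum(bigram_overlap) / len(bigram_overlap)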
# ---
#
# ### Low-Value vs High-Value Questions
#
# Let's say we only want to study questions that pertain to high-value questions instead of low-value questions. This will help us earn more money when we're on Jeopardy.
#
# We can actually figure out which terms correspond to high-value questions using a chi-squared test. We'll first need to split the questions into two categories:
#
# Low value - Any row where `clean_value` is 800 or less.
# High value - Any row where `clean_value` is greater than 800.

# In[13]:

def high_or_low_value(row):
    # 1 = high value (> 800), 0 = low value (<= 800)
    value = 0
    if row['clean_value'] > 800:
        value = 1
    return value

jeopardy['high_value'] = jeopardy.apply(high_or_low_value, axis=1)
jeopardy.head()

# *The `high_value` column categorizes each question as either high value [1] or low value [0].*
#
# ---
#
# ### Observed Quantity of High-Value vs Low-Value Questions
#
# We will create a function that takes in a word and returns the number of high-value and low-value questions that word showed up in.

# In[19]:

def high_or_low_count(word):
    low_count = 0
    high_count = 0
    # Scan every question for the word and tally by value category
    for i, rows in jeopardy.iterrows():
        split_question = rows['clean_question'].split(" ")
        if word in split_question:
            if rows['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_high_low = []
comparison_terms = list(terms_repeated_overall)[:5]

for item in comparison_terms:
    observed_high_low.append(high_or_low_count(item))

observed_high_low

# In[20]:

comparison_terms

# *For each term in `comparison_terms`, the observed (high count, low count) pair is listed in `observed_high_low`.*
#
# ---
#
# ### Applying the Chi-Squared Test
#
# We can use the chi-squared test to see if the high/low splits of the terms in `comparison_terms` are statistically significant.
#
# For that, we will first find the expected high and low counts for each term. Then we will pass the expected and observed values to the `chisquare` function from `scipy.stats` to get the chi-squared value.
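# To spell out the expected-count arithmetic the code below relies on: if a term appears in $n$ questions out of $N$ total, and there are $N_{high}$ high-value and $N_{low}$ low-value questions, then under the null hypothesis of no association,
#
# $$E_{high} = \frac{n}{N} \times N_{high}, \qquad E_{low} = \frac{n}{N} \times N_{low}$$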
# In[21]:

from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]

chi_squared = []

for item in observed_high_low:
    total = sum(item)
    total_prop = total / jeopardy.shape[0]
    high_value_expected = total_prop * high_value_count
    low_value_expected = total_prop * low_value_count

    observed = np.array([item[0], item[1]])
    expected = np.array([high_value_expected, low_value_expected])
    chi_squared.append(chisquare(observed, expected))

chi_squared

# ### Chi-Squared Results
#
# None of the p-values are less than 0.05, so none of these terms are statistically significant.
#
# ***However, if we perform the same test for all the words, then words with a p-value less than 0.05 and a high chi-squared value would be the most valuable to study.***
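# A minimal sketch of that follow-up, reusing `high_or_low_count` and the counts from above. Two caveats: `high_or_low_count` scans the whole dataframe per term, so this is slow over the full vocabulary, and the `high > low` filter is our own assumption for keeping terms that skew toward high-value questions. Rare terms also violate the chi-squared frequency assumptions, so their results should be read cautiously.

# In[ ]:

significant_terms = []

for term in terms_repeated_overall:
    high, low = high_or_low_count(term)
    total = high + low
    total_prop = total / jeopardy.shape[0]
    observed = np.array([high, low])
    expected = np.array([total_prop * high_value_count, total_prop * low_value_count])
    chi, p = chisquare(observed, expected)
    # Keep terms that are significant and skew toward high-value questions
    if p < 0.05 and high > low:
        significant_terms.append((term, chi, p))

# Highest chi-squared values first: the most promising terms to study
sorted(significant_terms, key=lambda t: t[1], reverse=True)[:20]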