#!/usr/bin/env python # coding: utf-8 # # Winning Jeopardy! # # ![bhpb7lv5v0c31%20%281%29.jpg](attachment:bhpb7lv5v0c31%20%281%29.jpg) # ## Introduction # # Jeopardy! is an American television game show created by _Merv Griffin_. The show features a quiz competition in which contestants are presented with general knowledge clues in the form of answers, and must phrase their responses in the form of questions. For example, if a contestant were to select "Presidents for $200", the resulting clue could be "This 'Father of Our Country' didn't really chop down a cherry tree", to which the correct response is "Who is/was George Washington?" (Contestants are free to phrase the response in the form of any question; the traditional phrasing of "who is/are" for people or "what is/are" for things or words is almost always used.) # # ### Clarification: Clue instead of Question # # The column titled **'Question'** in the data file relates to the general knowledge **Clues** in the form of answers posed by the TV Show Host. So technically, they are not questions. Therefore, I will change the title heading from **'Question'** to **'Clue'**. # # The column title **'Answer'** is the response of the contestant and their response must be in the form of a question. Even though that's the case and the answers shown in the available file are not in question form, **I WILL NOT** change that column title to **'Question'**, but will leave it as **'Answer'.** # # The dataset is named jeopardy.csv, and contains 20000 rows from the beginning of a full dataset of Jeopardy questions, which you can download [here](https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/ 'here'). # ## Project Goal # # The goal of this project is to work with a dataset of Jeopardy questions to figure out if there are significant patterns in the questions that could help me win if I had an interest in being a future contestant. 
# ## File Description
#
# The provided file is an unordered list of questions where each question has:
#
# - 'category' : the question category, e.g. "HISTORY"
#
# - 'value' : \\$ value of the question as string, e.g. "$200"
#
#     - Note: This is "None" for Final Jeopardy! and Tiebreaker questions
#
# - 'question' : text of question
#
#     - **Note: This sometimes contains hyperlinks and other messy text, such as when there's a picture or video question**
#
# - 'answer' : text of answer
#
# - 'round' : one of "Jeopardy!","Double Jeopardy!","Final Jeopardy!" or "Tiebreaker"
#
#     - Note: Tiebreaker questions do happen but they're very rare (like once every 20 years)
#
# - 'show_number' : string of show number, e.g '4680'
#
# - 'air_date' : the show air date in format YYYY-MM-DD
#
# Two of the column titles can be confusing. The column titled 'Question'
# holds the host's clues and the column titled 'Answer' holds the contestants'
# responses (see the clarification in the introduction).

# In[1]:


# import key libraries to execute this project.
import random
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML
from IPython.display import display, Markdown
from numpy.random import seed, randint

# NOTE: this silences *every* warning (including pandas and seaborn
# deprecation notices) for the rest of the run.
warnings.filterwarnings('ignore')

# read *.csv file provided.
jeopardy_df = pd.read_csv('jeopardy.csv')

# change column title from 'Question' to 'Clue'
# (the raw header has a leading space, hence ' Question').
jeopardy_df = jeopardy_df.rename(columns={' Question': 'Clue'})

display(Markdown('\n\nFirst Five Rows\n\n'))
display(jeopardy_df.head())
print('\n')

display(Markdown('\n\nLast Five Rows\n\n'))
display(jeopardy_df.tail())

display(Markdown('\n\nColumn Titles\n\n'))
# printed explicitly so the titles also show when this runs as a script.
print(list(jeopardy_df.columns))


# ## Observations
#
# NOTE: I see that if I need to reference certain columns, some titles have a space before the first letter.

# ## Clean Clue Column
#
#

# In[2]:


# set column width wider to see more of question displayed.
pd.set_option('display.max_colwidth', 85)

# translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# define function to remove punctuations from strings.
def remove_punctuations(text):
    """Return *text* with every ``string.punctuation`` character removed.

    Missing values (``None``/``NaN``) become an empty string and any other
    non-string value is converted with ``str`` first, so the function can be
    applied to a whole column without raising on gaps in the data.
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)
    # one pass over the string instead of one .replace() per punctuation mark.
    return text.translate(_PUNCTUATION_TABLE)


# remove all punctuation from Clue column, apply to the DF series
jeopardy_df['Clean_Clue'] = jeopardy_df['Clue'].apply(remove_punctuations)
display(Markdown('\n\nPunctuation Removed\n\n'))
# select by name rather than position so the view survives column changes.
display(jeopardy_df[['Clue', 'Clean_Clue']].head())

# make all characters lower case in 'Clean Clue' column.
display(Markdown('\n\nMake All Lower Case\n\n'))
jeopardy_df['Clean_Clue'] = jeopardy_df['Clean_Clue'].str.lower()
display(jeopardy_df[['Clue', 'Clean_Clue']].head())

# reset column width to default.
pd.reset_option('display.max_colwidth')


# ## Words per Clue Statement
#
# Let's look at the full range of number of words per clue statement among the 20,000 clues.
# I will use a box and whisker plot to see if there are any outliers that perhaps should be omitted from analysis.

# In[3]:


# create a column that contains the number of words in each clue.
jeopardy_df['number_of_cwords'] = jeopardy_df['Clean_Clue'].str.split().str.len()
y = jeopardy_df['number_of_cwords']

# use Seaborn coding to generate box and whisker plot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.boxplot(
    data=y,
    width=0.35,
    linewidth=2.5,
    palette='Blues',
    showmeans=True,
    medianprops={'color': 'red'},
    meanprops={
        'marker': 'o',
        'markerfacecolor': 'white',
        'markeredgecolor': 'black',
        'markersize': '10',
    },
)
plt.title('Boxplot of Words per Clue Statement', fontsize=22, pad=20)
plt.xlabel('Jeopardy Clue Statements', fontsize=20, labelpad=20)
plt.ylabel('Number of Words per Clue Statement', fontsize=20, labelpad=20)
# a single box needs no x tick label.
ax.set(xticklabels=[])
plt.show()


# ## Shortest and Longest Clue Statements
#
# There are outlier clue statements with more than 26 words and perhaps a few with fewer than 4.
# I'm not sure if such outlier clues are worth keeping in the analysis. I will observe the nature of clue statements with many words and decide whether to remove them. I will also observe clues with three or fewer words and decide whether to exclude or include those.
#
#

# In[4]:


# extend column width to display as much of 'Clue' statement as possible.
pd.set_option('display.max_colwidth', 110)

# determine maximum number of words in each 'Clue'.
column = jeopardy_df['number_of_cwords']
max_value = column.max()
max_index = column.idxmax()
print('\n')
print('Clue Statement with maximum number of words =', max_value,
      'and corresponding index is', max_index)

# show the longest clue itself; the heading and row label are derived from the
# values computed above instead of being hard-coded for one particular file.
display(Markdown('\n\nAn Example Clue with {} Words\n\n'.format(max_value)))
print(jeopardy_df.loc[max_index, 'Clean_Clue'])

thirty_word_c = jeopardy_df[jeopardy_df['number_of_cwords'] == 30]
display(Markdown('\n\nExample Clues with 30 Words\n\n'))
display(thirty_word_c['Clean_Clue'].head(10))

display(Markdown('\n\nExample Clues with Only 1 Word\n\n'))
one_word_c = jeopardy_df[jeopardy_df['number_of_cwords'] == 1]
display(one_word_c.iloc[1:6, 3:10])

display(Markdown('\n\nExample Clues with Only 2 Words\n\n'))
two_word_c = jeopardy_df[jeopardy_df['number_of_cwords'] == 2]
display(two_word_c.iloc[1:6, 3:10])

display(Markdown('\n\nExample Clues with Only 3 Words\n\n'))
three_word_c = jeopardy_df[jeopardy_df['number_of_cwords'] == 3]
display(three_word_c.iloc[1:6, 3:10])

# create new dataframe name with various rows removed.
# .copy() makes this an independent frame, so the columns added to it later
# are not chained assignments on a slice of jeopardy_df.
jeopardy2_df = jeopardy_df[jeopardy_df['number_of_cwords'] < 30].copy()
print('There are', len(jeopardy2_df), 'remaining rows after removing clues with 30 words or more.')

# return column width to default.
pd.reset_option('display.max_colwidth')


# ## Observations and Executive Decisions
#
# Based on the condition of clue statements with 30 words or more (messy, meaningless non-words, hyperlinks, etc.), I will remove all rows where clue statements have 30 words or more. That only reduces the total number of clue statements available to analyze by 1.8%. I really don't think removing these clues will negatively impact the analysis results.
#
# The example clue statements shown above with only 1, 2 or 3 words look to be legitimate. It's the category subject (e.g. 'IT'S OURS' or 'NOT A CURRENT NATIONAL CAPITAL', ...) that makes it possible to have clue statements with so few words. Therefore I will keep all clue statements with fewer than 4 words intact.

# In[5]:


# create a dataframe with two columns to generate two box and whisker plots.
# The two series are aligned on the row index, so rows that were filtered out
# show up as NaN in 'After'.
df = pd.DataFrame({
    'Before': jeopardy_df['number_of_cwords'],
    'After': jeopardy2_df['number_of_cwords'],
})

# use Seaborn coding to generate box and whisker plot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.boxplot(
    data=df,
    width=0.35,
    linewidth=2.5,
    palette='Blues',
    showmeans=True,
    medianprops={'color': 'red'},
    meanprops={
        'marker': 'o',
        'markerfacecolor': 'white',
        'markeredgecolor': 'black',
        'markersize': '10',
    },
)
plt.title('Boxplot of Words per Clue Statement', fontsize=22, pad=20)
plt.xlabel('Jeopardy Clue Statements', fontsize=20, labelpad=20)
plt.ylabel('Number of Words per Clue Statement', fontsize=20, labelpad=20)
plt.show()


# ## Clean Answer Column
#
#

# In[6]:


# translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# define function to remove punctuations from strings.
def remove_punctuations(text):
    """Return *text* with every ``string.punctuation`` character removed.

    Missing values (``None``/``NaN``) become an empty string and any other
    non-string value is converted with ``str`` first, so the function can be
    applied to the Answer and Value columns without raising on gaps.
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)
    return text.translate(_PUNCTUATION_TABLE)


# remove all punctuation from Answer column, apply to the DF series
jeopardy2_df['Clean_Answer'] = jeopardy2_df[' Answer'].apply(remove_punctuations)
display(Markdown('\n\nPunctuation Removed\n\n'))
display(jeopardy2_df[[' Answer', 'Clean_Answer']].head())

# make all characters lower case in 'Clean Answer' column.
display(Markdown('\n\nMake All Lower Case\n\n'))
jeopardy2_df['Clean_Answer'] = jeopardy2_df['Clean_Answer'].str.lower()
display(jeopardy2_df[[' Answer', 'Clean_Answer']].head())

# count the words in each answer and locate the longest one.
jeopardy2_df['number_of_awords'] = jeopardy2_df['Clean_Answer'].str.split().str.len()
column = jeopardy2_df['number_of_awords']
max_value = column.max()
max_index = column.idxmax()
print('\n')
print('Answer Statement with maximum number of words =', max_value,
      'and corresponding index is', max_index, '\n')

# look the row up through max_index rather than a hard-coded label, so this
# keeps pointing at the longest answer if the data changes.
display(Markdown('\n\nClue Statement to Longest Answer\n\n'))
print(jeopardy2_df.loc[max_index, 'Clean_Clue'], '\n')
display(Markdown('\n\nLongest Answer\n\n'))
print(jeopardy2_df.loc[max_index, 'Clean_Answer'], '\n')


# ## Observations
#
# The longest 'Answer' statement looks legitimate. I will not remove any Answer statements.

# In[7]:


print(jeopardy2_df[' Value'].value_counts(dropna=False))

# remove punctuations from 'Value' column.
jeopardy2_df['Clean_Value'] = jeopardy2_df[' Value'].apply(remove_punctuations)
display(Markdown('\n\nPunctuation Removed\n\n'))
# compare the raw and cleaned value columns side by side.
display(jeopardy2_df[[' Value', 'Clean_Value']].head())

value_dtype = jeopardy2_df.dtypes['Clean_Value']
print(value_dtype)

# anything that is not a number (missing values, "None") becomes 0.
jeopardy2_df['Clean_Value'] = (
    pd.to_numeric(jeopardy2_df['Clean_Value'], errors='coerce')
    .fillna(0)
    .astype(int)
)
display(Markdown('\n\nClean Value in Finished State\n\n'))
display(jeopardy2_df[[' Value', 'Clean_Value']].head())

jeopardy2_df[' Air Date'] = pd.to_datetime(jeopardy2_df[' Air Date'])
# DataFrame.info() prints its report itself and returns None.
jeopardy2_df.info()


# In[8]:


def count_matches(row):
    """Return the fraction of answer words that also appear in the clue.

    ``row`` must provide the 'Clean_Answer' and 'Clean_Clue' strings. The
    first occurrence of "the" is dropped from the answer before comparing.
    An answer with no remaining words scores 0.
    """
    split_answer = row["Clean_Answer"].split()
    # a set makes each membership test below constant-time.
    clue_words = set(row["Clean_Clue"].split())
    if "the" in split_answer:
        split_answer.remove("the")
    if not split_answer:
        return 0
    match_count = sum(1 for item in split_answer if item in clue_words)
    return match_count / len(split_answer)


jeopardy2_df["answer_in_clue"] = jeopardy2_df.apply(count_matches, axis=1)
print(jeopardy2_df["answer_in_clue"].mean())


# In[9]:


# sort dataframe by ascending 'Air Date'.
# sort_values returns a new frame, so the result has to be assigned back;
# a stable sort keeps the file order for clues aired on the same day.
jeopardy2_df = jeopardy2_df.sort_values(by=[' Air Date'], kind='mergesort')
display(jeopardy2_df.iloc[:5, 1:7])
display(jeopardy2_df.iloc[-5:, 1:7])

# create an empty list and an empty set.
question_overlap = []
terms_used = set()

# for each clue (oldest first): the share of its long words (6+ characters)
# that already appeared in an earlier clue.
for clue in jeopardy2_df["Clean_Clue"]:
    long_words = [word for word in clue.split(" ") if len(word) > 5]
    # count before adding this clue's own words to the seen-set.
    match_count = sum(1 for word in long_words if word in terms_used)
    terms_used.update(long_words)
    if long_words:
        match_count /= len(long_words)
    question_overlap.append(match_count)

jeopardy2_df["question_overlap"] = question_overlap
print(jeopardy2_df["question_overlap"].mean())


# In[10]:


def determine_value(row):
    """Return 1 when the row's 'Clean_Value' is above 800, otherwise 0."""
    return 1 if row["Clean_Value"] > 800 else 0


jeopardy2_df["High_Value"] = jeopardy2_df.apply(determine_value, axis=1)


# In[11]:


def count_usage(term):
    """Count the clues containing *term*, split by value class.

    Returns a ``(high_count, low_count)`` tuple: the number of high-value
    (``High_Value == 1``) and other clues in ``jeopardy2_df`` whose
    space-separated 'Clean_Clue' words include *term*.
    """
    low_count = 0
    high_count = 0
    # iterate the two columns directly; much cheaper than DataFrame.iterrows().
    for clue, high_value in zip(jeopardy2_df["Clean_Clue"], jeopardy2_df["High_Value"]):
        if term in clue.split(" "):
            if high_value == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count


# In[12]:


from random import choice

# pick 10 random terms (with replacement) and count their usage.
terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

observed_expected = [count_usage(term) for term in comparison_terms]
print(observed_expected)


# In[13]:


from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy2_df[jeopardy2_df["High_Value"] == 1].shape[0]
low_value_count = jeopardy2_df[jeopardy2_df["High_Value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    # share of all clues that use the term; the expected counts split that
    # share in proportion to the high/low value row counts.
    total_prop = total / jeopardy2_df.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count

    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

print(chi_squared)


# ## Conclusions
#
# None of the calculated p-values are less than 0.05. Therefore there is no significant difference between observed and expected values.
#
# I can't really say there are any significant patterns observed in the questions that could help me win as a potential future contestant. I'm sure there is more analysis I could do, however I will leave that and move on with other coding training.