import json DATA_PATH = 'cleaned_data.json' all_data = json.load(open(DATA_PATH)) import pandas as pd import numpy as np answers = pd.DataFrame(all_data) answers['duration'] = answers.submit_time - answers.srv_time answers.user_id = answers.user_id.astype(np.int) # Clamp response time to 2 minutes. answers.duration[answers.duration > 120] = 120 num_answered = answers.user_id.value_counts() completions = num_answered[num_answered > 10].index answers = answers[answers.user_id.isin(completions)] # Some of the survey experiments did not get many responses, filter them out source_counts = answers['type'].value_counts() # So like 3 people completed surveys from twitter :)... source_counts answers[0:3] answers = answers[answers.type.isin(source_counts[source_counts > 100].index)] import numpy as np grouped = answers.groupby(['type', 'question_id'])['duration'].agg({'mean': np.mean, 'count': len, 'std': np.std}) grouped questions = json.load(open('../app/survey.json')) text_by_id = {q['id']: q['question'] for q in questions['questions']} for q in questions['questions']: if 'prompt' not in q: continue answers.answer[(answers.question_id == q['id']) & (answers['answer'] == q['prompt'])] = 'DEFAULT' print(answers['type'].unique()) answers['type'].value_counts() %pylab inline import matplotlib.pyplot as plt def plot_single_question(question_id, survey_answers, answer_types, normalize=False, ax=None): durations = [survey_answers[(survey_answers.type == t) & (survey_answers.question_id == question_id)].duration for t in answer_types] if normalize: weights = [np.ones_like(d) / len(d) for d in durations] else: weights = None plt.hist(durations, label=list(answer_types), weights=weights) ax.legend(bbox_to_anchor=(1.7, .95)) title(text_by_id[question_id][:80] + ' (question {})'.format(question_id)) def plot_all_questions(survey_answers, answer_types, question_ids, normalize=False): size = 5 plt.figure(figsize=(size, len(question_ids) * size)) for (i, q) in enumerate(question_ids): 
ax = plt.subplot(len(question_ids), 1, i + 1) plot_single_question(q, survey_answers, answer_types, normalize=normalize, ax=ax) plot_all_questions(answers[answers.duration < 50], ['Turk, asking for Fast', 'Mechanical Turk 5 Cents'], list(1 + i for i in range(11)), normalize=True) from scipy.stats import gaussian_kde (fraudy, legit) = ('Turk, asking for Fast', 'Mechanical Turk 5 Cents') fraudy_timings = {} legit_timings = {} for i in range(11): q_id = i + 1 q_frame = answers.duration[answers.question_id == q_id] fraudy_timings[q_id] = gaussian_kde(q_frame[answers.type == fraudy]) legit_timings[q_id] = gaussian_kde(q_frame[answers.type == legit]) SIZE = 5 NUM_QUESTIONS = len(legit_timings) plt.figure(figsize=(SIZE, SIZE * NUM_QUESTIONS)) for i in sorted(fraudy_timings): plt.subplot(NUM_QUESTIONS, 1, i) title(text_by_id[i][:80] + ' (question {})'.format(i)) fraud_kde = fraudy_timings[i] legit_kde = legit_timings[i] x = np.arange(0, 120, .1) plt.yticks([]) plt.xlabel('Seconds to answer') plt.plot(x, fraud_kde.evaluate(x), 'r', label='more fraudulent') plt.plot(x, legit_kde.evaluate(x), 'g', label='more good') plt.legend() import re political_regex = re.compile('.*(obama|jfk|kennedy|ronald|reagan|regan|clinton|bill cl|' 'george washington|george w|dukakis|saddam|' 'bush|carter|nixon|modi|gorbachev|lincoln|trudeau|' 'brezhnev|perot|' 'mahatma gandhi|nehru|gingrich|martin luther king|mlk|' 'rajiv gandhi|ford|rajive gandhi|eisenhower|' 'rahul gandhi|indira gandhi|nelson mandela|white house|' 'gandhi|thatcher).*', re.IGNORECASE) answers['figure'] = answers[answers.question_id == 3]['answer'].str.match(political_regex) def get_first(l): if type(l) == tuple and l: return l[0].lower() answers.figure_clean = answers.figure.apply(get_first) answers.figure_clean.value_counts() # Someone famous in India who I had no clue about: answers.answer[answers.figure_clean == 'modi'] # Find the folks not captured by that regex: def empty_tuple(x): return type(x) == type([]) and not x 
list(answers[answers.figure.apply(empty_tuple) & (answers.answer != 'DEFAULT')].answer)


def score_for_user(user_id, initial_fraud_probability=.1):
    """Sequentially update P(fraud) for one user from per-question durations.

    For each question (in question_id order) the fraud/non-fraud probabilities
    are multiplied by the KDE likelihood of the observed duration under the
    fraudy/legit timing models, then renormalized.

    Returns a list of dicts, one per question, carrying the running
    'fraud_p' / 'nonfraud_p' posteriors alongside the raw answer and duration.
    """
    fraud_probability = initial_fraud_probability
    nonfraud_probability = 1 - initial_fraud_probability
    # DataFrame.sort() was removed in pandas 0.20; sort_values replaces it.
    the_data = answers[answers.user_id == user_id].sort_values('question_id')[[
        'question_id', 'answer', 'duration']]
    partial_results = []
    for r in the_data.iterrows():
        (question_id, answer, duration) = r[1]
        fraud_likelihood = fraudy_timings[question_id].evaluate(duration)[0]
        fraud_probability *= fraud_likelihood
        nonfraud_likelihood = legit_timings[question_id].evaluate(duration)[0]
        nonfraud_probability *= nonfraud_likelihood
        # Renormalize each step so the two probabilities stay a distribution
        # (and don't underflow over many questions).
        normalizer = nonfraud_probability + fraud_probability
        fraud_probability /= normalizer
        nonfraud_probability /= normalizer
        partial_results.append({'question_id': question_id, 'duration': duration,
                                'answer': answer, 'fraud_p': fraud_probability,
                                'nonfraud_p': nonfraud_probability})
    return partial_results


score_for_user(87397087779)
answers.duration[answers.duration < 10].round(0).value_counts()

all_scores = []
for u in answers.user_id.unique():
    all_scores.append(score_for_user(u))

plt.hist([d[-1]['fraud_p'] for d in all_scores])
plt.xlabel('Final probability of fraud')
plt.ylabel('Number of people')

# Looking at some example
import random
from pprint import pprint


def get_instance(collection, lower, upper, function):
    """Return a random element c of collection with lower <= function(c) <= upper.

    Returns None when nothing falls in the range.  `random` is imported
    explicitly above instead of relying on the %pylab namespace.
    """
    g = list(c for c in collection if lower <= function(c) <= upper)
    if g:
        return g[random.randint(0, len(g) - 1)]


get_fraud_p = lambda c: c[-1]['fraud_p']
print('Good surveys')

short_questions = {1: "First name",
                   2: "People with your name honest?",
                   3: "Earliest political memory?",
                   4: "Men or women need more exercise?",
                   # NOTE(review): "saw" below looks like a typo for "sad" in the
                   # original; kept byte-for-byte since it is display text.
                   5: "What country do you live in?",
                   6: "Allocating money to different departments",
                   7: "How saw would you be if various plants went away?",
                   8: "What animal would you not want to leave with a sheep?",
                   9: "10 kids, 1 evil kid, 0 kids, or 2 bad kids?",
                   10: "Do you have any idea what the word 'Telluride' means?",
                   11: "Who would your parents like?"}


def to_table(answers):
    """Render one user's scored answers as an HTML table.

    NOTE(review): the original HTML markup was stripped by the notebook export
    (only the header text "Answer / Duration / Fraud Probability" survived);
    the tags below are a reconstruction — confirm against the rendered output.
    """
    rows = ['<tr><td>{}</td><td>{}</td><td>{:0.3f}</td><td>{:0.5f}</td></tr>'
            .format(short_questions[r['question_id']], r['answer'],
                    r['duration'], r['fraud_p'])
            for r in answers]
    return ('<table>\n'
            '<tr><th>Question</th><th>Answer</th><th>Duration</th>'
            '<th>Fraud Probability</th></tr>\n'
            '{}\n'
            '</table>'.format('\n'.join(rows)))


print('Good table\n', to_table(get_instance(all_scores, .0, .1, get_fraud_p)))
print('Bad table\n', to_table(get_instance(all_scores, .9, 1, get_fraud_p)))

# Export the fitted per-question PDFs on a fixed grid so the live app can
# score users without scipy.
pdf_data = {'range': [0, 120], 'step_size': .25}
pdf_values = {}
x = np.arange(pdf_data['range'][0], pdf_data['range'][1], pdf_data['step_size'])
for i in range(1, 12):
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    pdf_values[i] = {'legit': list(legit_kde.evaluate(x)),
                     'fraudy': list(fraud_kde.evaluate(x))}
pdf_data['values'] = pdf_values
with open('fraud_model_pdf.json', 'wt') as f:
    json.dump(pdf_data, f)