In [1]:
import json
DATA_PATH = 'cleaned_data.json'
all_data = json.load(open(DATA_PATH))
In [2]:
import pandas as pd
import numpy as np
answers = pd.DataFrame(all_data)
answers['duration'] = answers.submit_time - answers.srv_time
answers.user_id = answers.user_id.astype(np.int)

# Clamp response time to 2 minutes.
answers.duration[answers.duration > 120] = 120

num_answered = answers.user_id.value_counts()
completions = num_answered[num_answered > 10].index
answers = answers[answers.user_id.isin(completions)]

# Some of the survey experiments did not get many responses, filter them out
source_counts = answers['type'].value_counts()
In [3]:
# So like 3 people completed surveys from twitter :)...
source_counts
Out[3]:
Mechanical Turk 5 Cents               4984
Turk, asking for Fast                 2597
Facebook                               198
Mechanical Turk 2 Cents                143
Twitter                                 33
Mechanical Turk High Skill Request      11
AdWords, asking for 2 cents.            11
Tiffany                                 11
dtype: int64
In [4]:
answers[0:3]
Out[4]:
answer city country_name ip_hash latitude longitude question_id region_name srv_time submit_time survey_order_id type user_id version duration
9 Obama speaking at his inauguration. Visalia United States 3412077616522023084 36.2958 -119.3812 3 California 1.389912e+09 1.389912e+09 21ejkavsdh Mechanical Turk High Skill Request 42403892847 0.1 38.466112
10 I think they both should exercise equally. Visalia United States 3412077616522023084 36.2958 -119.3812 4 California 1.389912e+09 1.389912e+09 21ejkavsdh Mechanical Turk High Skill Request 42403892847 0.1 25.887507
11 USA Visalia United States 3412077616522023084 36.2958 -119.3812 5 California 1.389912e+09 1.389912e+09 21ejkavsdh Mechanical Turk High Skill Request 42403892847 0.1 4.623164
In [5]:
answers = answers[answers.type.isin(source_counts[source_counts > 100].index)]
In [6]:
import numpy as np
grouped = answers.groupby(['type', 'question_id'])['duration'].agg({'mean': np.mean, 'count': len, 'std': np.std})
grouped
Out[6]:
count std mean
type question_id
Facebook 1 18 6.528073 10.140472
2 18 22.227931 23.996327
3 18 30.022063 37.650166
4 18 28.509485 34.013478
5 18 3.045837 6.149543
6 18 18.436770 42.643649
7 18 25.016560 40.394385
8 18 10.808506 18.092980
9 18 17.671644 26.297783
10 18 17.864162 18.963150
11 18 22.103053 35.284498
Mechanical Turk 2 Cents 1 13 22.553344 15.914699
2 13 9.001608 22.677917
3 13 31.036975 41.914349
4 13 30.803747 28.500610
5 13 6.284513 8.555612
6 13 13.852425 34.589414
7 13 16.696772 28.217106
8 13 10.241031 16.047161
9 13 11.543667 21.571579
10 13 19.762374 20.000313
11 13 28.408319 27.910834
Mechanical Turk 5 Cents 1 453 12.450826 14.048011
2 454 18.235401 23.120424
3 453 24.965651 36.441400
4 453 27.328623 34.381162
5 453 7.824829 8.795530
6 453 21.744677 41.797269
7 453 21.407686 35.237853
8 453 17.072572 19.110877
9 453 26.264251 32.945017
10 453 23.375196 25.831645
11 453 18.884085 29.102843
Turk, asking for Fast 1 236 11.803632 10.964369
2 237 13.104876 15.321082
3 236 21.295649 26.113626
4 236 19.836304 21.603866
5 236 9.395384 7.354463
6 236 19.971123 33.152513
7 236 18.643996 27.216647
8 236 7.632557 12.295015
9 236 18.172701 20.654331
10 236 16.994212 15.822472
11 236 12.158383 22.395870
In [7]:
questions = json.load(open('../app/survey.json'))
text_by_id = {q['id']: q['question'] for q in questions['questions']}

for q in questions['questions']:
    if 'prompt' not in q:
        continue
    answers.answer[(answers.question_id == q['id']) & (answers['answer'] ==  q['prompt'])] = 'DEFAULT'
In [8]:
print(answers['type'].unique())
answers['type'].value_counts()
['Turk, asking for Fast' 'Mechanical Turk 2 Cents' 'Facebook'
 'Mechanical Turk 5 Cents']
Out[8]:
Mechanical Turk 5 Cents    4984
Turk, asking for Fast      2597
Facebook                    198
Mechanical Turk 2 Cents     143
dtype: int64

I'm just going to work on finding the timing differences between the 5 cent turk answers and the "Do it fast" turk answers.

In [9]:
%pylab inline

import matplotlib.pyplot as plt

def plot_single_question(question_id, survey_answers, answer_types,
                         normalize=False, ax=None):
    durations = [survey_answers[(survey_answers.type == t) &
                                (survey_answers.question_id == question_id)].duration
                 for t in answer_types]
    if normalize:
        weights = [np.ones_like(d) / len(d) for d in durations]           
    else:
        weights = None
    
    plt.hist(durations, label=list(answer_types), weights=weights)
    ax.legend(bbox_to_anchor=(1.7, .95))
    title(text_by_id[question_id][:80] + '  (question {})'.format(question_id))
    

def plot_all_questions(survey_answers, answer_types, question_ids, normalize=False):
    size = 5
    plt.figure(figsize=(size, len(question_ids) * size))
    for (i, q) in enumerate(question_ids):
        ax = plt.subplot(len(question_ids), 1, i + 1)
        
        plot_single_question(q, survey_answers, answer_types, normalize=normalize, ax=ax)
Populating the interactive namespace from numpy and matplotlib
/home/justinvf/anaconda/envs/blogging/lib/python3.3/site-packages/matplotlib/mathtext.py:46: UserWarning: Due to a bug in pyparsing <= 2.0.0 on Python 3.x, packrat parsing has been disabled.  Mathtext rendering will be much slower as a result.  Install pyparsing 2.0.0 or later to improve performance.
  warn("Due to a bug in pyparsing <= 2.0.0 on Python 3.x, packrat parsing "
In [10]:
plot_all_questions(answers[answers.duration < 50],
                   ['Turk, asking for Fast', 'Mechanical Turk 5 Cents'],
                   list(1 + i for i in range(11)),
                   normalize=True)
In [11]:
from scipy.stats import gaussian_kde
(fraudy, legit) = ('Turk, asking for Fast', 'Mechanical Turk 5 Cents')

fraudy_timings = {}
legit_timings = {}
for i in range(11):
    q_id = i + 1
    q_frame = answers.duration[answers.question_id == q_id]
    fraudy_timings[q_id] = gaussian_kde(q_frame[answers.type == fraudy])
    legit_timings[q_id] = gaussian_kde(q_frame[answers.type == legit])
In [12]:
SIZE = 5
NUM_QUESTIONS = len(legit_timings)
plt.figure(figsize=(SIZE, SIZE * NUM_QUESTIONS))

for i in sorted(fraudy_timings):
    plt.subplot(NUM_QUESTIONS, 1, i)
    title(text_by_id[i][:80] + '  (question {})'.format(i))
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    x = np.arange(0, 120, .1)
    plt.yticks([])
    plt.xlabel('Seconds to answer')
    plt.plot(x, fraud_kde.evaluate(x), 'r', label='more fraudulent')
    plt.plot(x, legit_kde.evaluate(x), 'g', label='more good')
    plt.legend()

Here the start of digging into some of the actual answer data, just for fun.

In [13]:
import re
political_regex = re.compile('.*(obama|jfk|kennedy|ronald|reagan|regan|clinton|bill cl|'
                             'george washington|george w|dukakis|saddam|'
                             'bush|carter|nixon|modi|gorbachev|lincoln|trudeau|'
                             'brezhnev|perot|'
                             'mahatma gandhi|nehru|gingrich|martin luther king|mlk|'
                             'rajiv gandhi|ford|rajive gandhi|eisenhower|'
                             'rahul gandhi|indira gandhi|nelson mandela|white house|'
                             'gandhi|thatcher).*', re.IGNORECASE)
answers['figure'] = answers[answers.question_id == 3]['answer'].str.match(political_regex)

def get_first(l):
    if type(l) == tuple and l:
        return l[0].lower()

answers.figure_clean = answers.figure.apply(get_first)
In [14]:
answers.figure_clean.value_counts()
Out[14]:
clinton               102
bush                   80
obama                  59
reagan                 47
gandhi                 41
nixon                  20
modi                   20
kennedy                18
white house            15
regan                  12
carter                 10
nehru                   7
eisenhower              6
jfk                     6
nelson mandela          5
ronald                  4
thatcher                4
gorbachev               3
george washington       3
trudeau                 3
martin luther king      3
perot                   3
ford                    2
dukakis                 2
lincoln                 2
george w                1
saddam                  1
bill cl                 1
gingrich                1
brezhnev                1
dtype: int64
In [15]:
# Someone famous in India who I had no clue about:
answers.answer[answers.figure_clean == 'modi']
Out[15]:
445                                         narendra modi
753                                                  modi
839               narendra modi speaking at a conferrence
946                         Narendra modi speech in stage
1093                                        narendra modi
1352                                       Narendran modi
1358                                         NARENRA MODI
1967                        modi spoken at mumbai grounds
2420                                                 modi
2560                                    narendra modi\r\n
2586                        Narendra Modi speech at Bihar
3192                                        narendra modi
3663                                        Narendra Modi
4619                                     MR.NARENDRA MODI
5286             While in past I heard the speach of Modi
5586                 narendra modi is speaking at gujarat
6009    Narendra modi announced as a Prime minister ca...
8383                    Modi spoken about Tea Shop worker
8527                                        Narendra modi
8548                                Modi speaking in dias
Name: answer, dtype: object
In [16]:
# Find the folks not captured by that regex:
def empty_tuple(x):
    return type(x) == type([]) and not x

list(answers[answers.figure.apply(empty_tuple) & (answers.answer != 'DEFAULT')].answer)
Out[16]:
['yes',
 'ARVIND KEJRIWAL',
 'The president Salinas giving an speech on tv',
 'Quite moderate.',
 'Mr.karunanudhi addressing a speech in tamil nadu',
 'suresh',
 "'95",
 'speak about economy',
 'i hate politics',
 'congressman speaking at school',
 'cant recall',
 '"Read my lips. No new taxes."',
 'moderatly coserevative',
 'idk',
 'Rajai speaking for DMK',
 '5',
 'Voting',
 'BEHAVIOUR',
 'vijaykanth',
 'Delhi election',
 'full of problems and pressure',
 'liberally',
 '',
 'i saw one of them on tv.',
 'DR.MANMOHAN SINGH SMILING AT SOMEONE IN THE AUDIENCE',
 'prefer not to say',
 ' My earliest memory is of the angry bullies who lived next door to us. ... anger he channeled toward political figures was rooted in something other than that',
 'prathipa patel',
 'Mahatma speaking in front of a crowd which I saw as a video on Tv',
 'Abdul Kalam',
 'no idea',
 'election',
 'subash chandra bos',
 'manmohan singh (Indian prime minister) speaking in his 3rd ever conference in the last 2 decades.',
 'Abdul Kalam',
 'Flag hoisting',
 'MGR ',
 'a meeting',
 'Arvind k in rajya sabha',
 'Prime Minister taking action',
 'NIL',
 'Moi',
 'Sheikh Hasina',
 'C P Muhammed MLA speaking in Pattambi',
 'Conference',
 'bharatiya janata party',
 'J Jayalalitha meet in my hometown.',
 'none',
 'sonia',
 'ntng',
 'usa',
 'Waste',
 'M.G.R',
 'LITTLE',
 'no idea',
 'Atal Bihari Vajpayee speaking at BJP party office',
 "Rahul's campaign speech ",
 'obema',
 'ana hazare speking of black mone',
 'aravind kejrival',
 'A P J Abdul Kalam',
 'Karunanithy',
 'Mr.karunanidhi, Former Chief Minister of Tamil Nadu speaking in a public meeting',
 'I AM NOT INTRESTED IN THAT',
 'attending a governors speech at age 8 or 9',
 'Smith\r\n',
 'abdul kalam',
 '',
 'Kejariwal becomes chief minister of Delhi',
 'SASI THAROOR',
 'Lyndon Johnson on the TV news',
 'senate',
 'Nothing',
 'i hate politics',
 '',
 'Aravind Kejriwal speech at Parliament',
 'Recent activity about devayani in the visa case',
 'mick ',
 'english',
 'jayalalitha ruling',
 '8',
 'None',
 'I have no idea.',
 'kejriwal became mp.',
 'Aravind kejarival won election at delhi',
 'bjp',
 'Regean being in trouble for Iran-Contra',
 'london',
 'My earliest memory of a political figure is vijayakandh spoke at Assembly.',
 'Good management',
 'hard to collect, but the one speach given by Atal Bihari was unforgetable.',
 'yes\r\n',
 '',
 'dishonesty',
 'nothing',
 'politics is simply like a useless material which is handed over to old guy to damage even more.',
 'LIBERAL',
 'chicago park, never',
 'benezir bhutto killed during campaign',
 'ldf',
 'Cris Daly of the SF Board of Supervisors cursing someone out.',
 'memorial services',
 'america',
 '',
 '4th grade',
 'aam aadmi',
 'none',
 'Indian primeminister Atal Bihai Vajpayee speech',
 'I DONT LIKE POLITICS',
 'speaking to parliment during question time',
 'Mahatha ghandhi biography',
 'Learning that James Bulger was the Prime Minister of New Zealand in 1992.',
 'As a little kid, wondering why Jean Chretien had a sideways mouth and talked sideways, after seeing him on tv.',
 'Abraham Lincon',
 'Pres. Johnson on TV when I was a young child, discussing Vietnam. . ',
 'The past leader of Malaysia, Tun Doktor Mahathir.',
 'interview',
 'My stepdad being upset over RFK being killed,',
 'Medical camp of TDP on NTR annual day.',
 "I was sitting in the waiting room of a VA Hospital.  I must have been 5 or younger.  I just remember some politician talking about China.  I'm not sure, but I think it was communist conversion.  Really vague memory.",
 'not interested',
 'hema',
 'Nothing',
 'Manmohan singh as finance minister',
 'english',
 'nil',
 'n/a',
 'i dont',
 'Chandra babu speaking in assembly',
 '0',
 '1978',
 'AAP speech',
 'I remember the italian President, Oscar Luigi Scalfaro',
 '9',
 'adal bihari vaajpaayi is prime minister',
 '7',
 'jayalalitha',
 'nothing',
 'Seeing IKE on tv as a small boy',
 'martin ',
 'nothing',
 'America, Work',
 'None',
 "I don't remember who it was exactly, but the politician who killed himself live on TV.",
 'Central election party AAP',
 'vajpai',
 'Eating bread',
 "Don't care",
 'liberal ',
 'nice',
 'reagon',
 '48',
 'High school',
 '1994 elections',
 'CKINTON',
 'The first president',
 "Don't have any not really into watching or reading anything political.",
 'None.',
 'when i was 14',
 'washington',
 'Bill getting his "surprise" ',
 'today and yesterday\r\n',
 'whitehouse',
 "Don't know",
 'kalainger',
 'infuencial',
 'na',
 "Clint's big scandal",
 'na',
 'john',
 'zulfiqar bhutto',
 'Our leader giving lecture instead of doing work.',
 'Atal bihari vajpayee speech at parliament',
 'Raegan Breaking down wall.',
 'When I was in 2nd grade.',
 'center',
 'JEYA LALITHA ',
 'our C.M is introducing lot of welfare to people',
 'BJP winning in India',
 'mani',
 'manmohanshing',
 'No idea',
 'Hitler',
 'karunanithi',
 'regarding political instability',
 'California regarding the financial cresis',
 'watching the political debates in tenth grade',
 'Kalaignar  many speeches in political',
 'miss.j. jeyalalitha provided laptop to school and college students',
 'the president',
 'my sister',
 'Chandra Babu naidu winning the electoins',
 'nope',
 'waste. I hate democracy',
 'an Italian politician',
 "congress leader's murder",
 'arvind kejriwal',
 'Attal Bihari vajpeyi visited the Tsunami affected areas.']
In [17]:
def score_for_user(user_id, initial_fraud_probability=.1):
    
    fraud_probability = initial_fraud_probability
    nonfraud_probability = 1 - initial_fraud_probability
    the_data = answers[answers.user_id == user_id].sort(columns=('question_id',))[[
        'question_id', 'answer', 'duration']]
    partial_results = []
    
    for r in the_data.iterrows():
        (question_id, answer, duration) = r[1]
        fraud_likelihood = fraudy_timings[question_id].evaluate(duration)[0]
        fraud_probability *= fraud_likelihood
        
        nonfraud_likelihood = legit_timings[question_id].evaluate(duration)[0]
        nonfraud_probability *= nonfraud_likelihood
        
        normalizer = nonfraud_probability + fraud_probability
        
        fraud_probability /= normalizer
        nonfraud_probability /= normalizer
        
        partial_results.append({'question_id': question_id,
                                'duration': duration,
                                'answer': answer,
                                'fraud_p': fraud_probability,
                                'nonfraud_p': nonfraud_probability})
    
    return partial_results
In [18]:
score_for_user(87397087779)
Out[18]:
[{'answer': 'DEFAULT',
  'duration': 17.01549005508423,
  'fraud_p': 0.078448550659764929,
  'nonfraud_p': 0.92155144934023514,
  'question_id': 1},
 {'answer': 'know',
  'duration': 16.06438899040222,
  'fraud_p': 0.078327530808187787,
  'nonfraud_p': 0.92167246919181234,
  'question_id': 2},
 {'answer': 'yes',
  'duration': 7.856693983078003,
  'fraud_p': 0.1601986332351793,
  'nonfraud_p': 0.83980136676482064,
  'question_id': 3},
 {'answer': 'no',
  'duration': 17.965824127197266,
  'fraud_p': 0.18695987381608123,
  'nonfraud_p': 0.81304012618391885,
  'question_id': 4},
 {'answer': 'ethiopia',
  'duration': 11.613417863845825,
  'fraud_p': 0.14424050342554573,
  'nonfraud_p': 0.85575949657445438,
  'question_id': 5},
 {'answer': 'immigration:1 transportation:1 healthcare:3 education:3 warfare:2',
  'duration': 37.87310600280762,
  'fraud_p': 0.13353764904151283,
  'nonfraud_p': 0.86646235095848712,
  'question_id': 6},
 {'answer': 'radish:2 lettuce:3 eggplant:6 tomato:6 aubergine:2 kiwi:8',
  'duration': 38.65355896949768,
  'fraud_p': 0.082402883940254051,
  'nonfraud_p': 0.91759711605974603,
  'question_id': 7},
 {'answer': 'tiger',
  'duration': 17.080639123916626,
  'fraud_p': 0.060961446981524434,
  'nonfraud_p': 0.93903855301847561,
  'question_id': 8},
 {'answer': '2 bad kids',
  'duration': 16.7669038772583,
  'fraud_p': 0.082194120577559093,
  'nonfraud_p': 0.91780587942244085,
  'question_id': 9},
 {'answer': 'no',
  'duration': 14.701430082321167,
  'fraud_p': 0.091859498736522979,
  'nonfraud_p': 0.90814050126347701,
  'question_id': 10},
 {'answer': 'Nabokov:1 Obama:4 Fidel Castro:1 Your favorite TV host:3 Babe Ruth:1',
  'duration': 32.44860100746155,
  'fraud_p': 0.062823282273206923,
  'nonfraud_p': 0.93717671772679312,
  'question_id': 11}]
In [19]:
answers.duration[answers.duration < 10].round(0).value_counts()
Out[19]:
8     322
7     317
6     305
9     277
5     268
4     172
10    129
3      78
1      45
2      37
0       4
dtype: int64
In [20]:
all_scores = []
for u in answers.user_id.unique():
    all_scores.append(score_for_user(u))

plt.hist([d[-1]['fraud_p'] for d in all_scores])
plt.xlabel('Final probability of fraud')
plt.ylabel('Number of people')
Out[20]:
<matplotlib.text.Text at 0x7f2b58c9d810>
In [21]:
#Looking at some example

from pprint import pprint

def get_instance(collection, lower, upper, function):
    g = list(c for c in collection if lower <= function(c) <= upper)
    if g:
        return g[random.randint(0, len(g) - 1)]

get_fraud_p = lambda c: c[-1]['fraud_p']

print('Good surveys')
short_questions = {1: "First name",
 2: "People with your name honest?",
 3: "Earliest political memory?",
 4: "Men or women need more exercise?",
 5: "What country do you live in?",
 6: "Allocating money to different departments",
 7: "How saw would you be if various plants went away?",
 8: "What animal would you not want to leave with a sheep?",
 9: "10 kids, 1 evil kid, 0 kids, or 2 bad kids?",
 10: "Do you have any idea what the word 'Telluride' means?",
 11: "Who would your parents like?"}

def to_table(answers):
    rows = ['<tr><td>{}</td><td>{}</td><td>{:0.3f}</td><td>{:0.5f}</td></tr>'
            .format(short_questions[r['question_id']], r['answer'], r['duration'], r['fraud_p'])
            for r in answers]
    return ('<table>\n'
            '<th><td>Answer</td><td>Duration</td><td>Fraud Probability</td></th>\n'
            '\n{}\n'
            '</table>'.format('\n'.join(rows)))
            


print('Good table\n', to_table(get_instance(all_scores, .0, .1, get_fraud_p)))
print('Bad table\n', to_table(get_instance(all_scores, .9, 1, get_fraud_p)))
Good surveys
Good table
 <table>
<th><td>Answer</td><td>Duration</td><td>Fraud Probability</td></th>

<tr><td>First name</td><td>Satia</td><td>6.209</td><td>0.11934</td></tr>
<tr><td>People with your name honest?</td><td>As much as anyone with any other name.</td><td>22.490</td><td>0.09134</td></tr>
<tr><td>Earliest political memory?</td><td>Nixon being impeached.</td><td>27.930</td><td>0.07606</td></tr>
<tr><td>Men or women need more exercise?</td><td>I think they need the same amount.</td><td>20.970</td><td>0.07745</td></tr>
<tr><td>What country do you live in?</td><td>US</td><td>5.400</td><td>0.07367</td></tr>
<tr><td>Allocating money to different departments</td><td>immigration:2 healthcare:3 education:3 warfare:0 transportation:2</td><td>31.218</td><td>0.07538</td></tr>
<tr><td>How saw would you be if various plants went away?</td><td>radish:3 lettuce:10 eggplant:6 tomato:10 aubergine:6 kiwi:3</td><td>27.000</td><td>0.07947</td></tr>
<tr><td>What animal would you not want to leave with a sheep?</td><td>A wolf</td><td>13.068</td><td>0.09462</td></tr>
<tr><td>10 kids, 1 evil kid, 0 kids, or 2 bad kids?</td><td>I already have 3 children but I definitely don't want anymore.</td><td>22.825</td><td>0.09188</td></tr>
<tr><td>Do you have any idea what the word 'Telluride' means?</td><td>It's something doing with science and a metal, I think, if I remember correctly.</td><td>39.526</td><td>0.03235</td></tr>
<tr><td>Who would your parents like?</td><td>Nabokov:2 Obama:4 Fidel Castro:1 Your favorite TV host:1 Babe Ruth:1</td><td>26.100</td><td>0.02641</td></tr>
</table>
Bad table
 <table>
<th><td>Answer</td><td>Duration</td><td>Fraud Probability</td></th>

<tr><td>First name</td><td>Natalie</td><td>6.862</td><td>0.11502</td></tr>
<tr><td>People with your name honest?</td><td>Yes, they are.</td><td>8.962</td><td>0.17751</td></tr>
<tr><td>Earliest political memory?</td><td>President Ronald Reagan speaking on television.</td><td>20.161</td><td>0.19511</td></tr>
<tr><td>Men or women need more exercise?</td><td>Men over 50 need about the same amount of exercise as women over 45.</td><td>17.454</td><td>0.23138</td></tr>
<tr><td>What country do you live in?</td><td>USA</td><td>4.092</td><td>0.25967</td></tr>
<tr><td>Allocating money to different departments</td><td>immigration:1 healthcare:3 education:5 warfare:0 transportation:2</td><td>13.936</td><td>0.41175</td></tr>
<tr><td>How saw would you be if various plants went away?</td><td>radish:9 lettuce:10 eggplant:6 tomato:7 aubergine:6 kiwi:5</td><td>18.782</td><td>0.51566</td></tr>
<tr><td>What animal would you not want to leave with a sheep?</td><td>A wolf</td><td>5.541</td><td>0.68619</td></tr>
<tr><td>10 kids, 1 evil kid, 0 kids, or 2 bad kids?</td><td>0 kids</td><td>5.290</td><td>0.80988</td></tr>
<tr><td>Do you have any idea what the word 'Telluride' means?</td><td>No, never heard of it</td><td>6.407</td><td>0.88554</td></tr>
<tr><td>Who would your parents like?</td><td>Nabokov:1 Obama:1 Fidel Castro:1 Your favorite TV host:2 Babe Ruth:3</td><td>15.959</td><td>0.92446</td></tr>
</table>
In [22]:
pdf_data = {'range': [0, 120],
            'step_size': .25}
pdf_values = {}

x = np.arange(pdf_data['range'][0], pdf_data['range'][1], pdf_data['step_size'])

for i in range(1, 12):
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    pdf_values[i] = {'legit': list(legit_kde.evaluate(x)),
                     'fraudy': list(fraud_kde.evaluate(x))}
pdf_data['values'] = pdf_values

json.dump(pdf_data, open('fraud_model_pdf.json', 'wt'))
In [ ]: