In [1]:

import requests
from bs4 import BeautifulSoup
from collections import defaultdict


def text_to_dict(paragraph_array, d, candidates):
    '''takes an array of text paragraphs from debate and returns dict 
    where key is person and value is list of text spoken by that candidate'''
    # just a default speaker that won't end up in our returned data
    # will get replaced when an actual speaker is found
    speaker = "<START>"
    for paragraph in paragraph_array:
        words = paragraph.text.split(' ')
        first_word = words[0]
        # only new speaker when have SPEAKER: format
        if first_word[-1] == ":":
            speaker = first_word[:-1]
        # only keep candidates text
        if speaker in candidates:
            d[speaker].append(words[1:])
    return d


def process_url(url, speaker_dict, candidates):
    # requests gets the source code from the url and extracts it as text
    html = requests.get(url).text
    # beautifulsoup is a library that takes in text source code and returns a structured format of that
    # source code that you can more easily search and parse.
    soup = BeautifulSoup(html, 'html5lib')
    # get all the 'p' tags from the source with class = 'story-body-text'
    # this was determined by looking at the source code
    # the first and last paragraphs are intro and ending
    paragraphs = soup('p', {'class': 'story-body-text'})[1:-1]
    text_to_dict(paragraphs, speaker_dict, candidates)
    
    
def process_url_list(urls, speaker_dict, candidates):
    for url in urls:
        process_url(url, speaker_dict, candidates)
    
    
candidates = ['BUSH', 'TRUMP', 'RUBIO', 'CARSON', 'FIORINA', 'KASICH', 'CRUZ', 'PAUL',
             'SANDERS', 'CLINTON', "O’MALLEY"]
urls = ['http://www.nytimes.com/2015/11/11/us/politics/transcript-republican-presidential-debate.html',
       'http://www.nytimes.com/2016/02/14/us/politics/transcript-of-the-republican-presidential-debate.html',
       'http://www.nytimes.com/2016/01/15/us/politics/transcript-of-republican-presidential-debate.html',
       'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-transcript.html?_r=0',
       'http://www.nytimes.com/2016/01/18/us/politics/transcript-of-the-democratic-presidential-debate.html',
       'http://www.nytimes.com/2016/02/12/us/politics/transcript-of-the-democratic-presidential-debate-in-milwaukee.html',
       'http://www.nytimes.com/2016/02/07/us/politics/transcript-of-the-republican-presidential-debate-in-new-hampshire.html',
       'http://www.nytimes.com/2016/02/05/us/politics/transcript-of-the-democratic-presidential-debate.html',
       'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-preliminary-transcript.html',
       'http://www.nytimes.com/2015/12/16/us/politics/transcript-main-republican-presidential-debate.html']
speaker_dict = defaultdict(list)

process_url_list(urls, speaker_dict, candidates)

/Users/tylerfolkman/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:19: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

In [2]:

def pargraphs_to_words(speaker_dict):
    d = defaultdict(list)
    for candidate, paragraphs in speaker_dict.items():
        for paragraph in paragraphs:
            for word in paragraph:
                d[candidate].append(word)
    return d

In [3]:

candidate_words = pargraphs_to_words(speaker_dict)

In [4]:

import json

with open('candidate_words_dict.json', 'w') as f:
    json.dump(candidate_words, f)