import requests
from bs4 import BeautifulSoup
from collections import defaultdict
def text_to_dict(paragraph_array, d, candidates):
    """Group debate transcript paragraphs by the candidate who spoke them.

    A paragraph whose first space-separated token ends in ``:`` (the
    ``SPEAKER:`` format) switches the current speaker; every other paragraph
    is attributed to the most recently seen speaker. Paragraphs whose text
    is empty or whitespace-only are skipped.

    Args:
        paragraph_array: iterable of objects exposing the paragraph content
            as a ``.text`` string attribute.
        d: mapping from speaker name to a list. It is indexed with
            ``d[speaker]`` before appending, so it must supply a list for
            unseen keys (e.g. ``defaultdict(list)``). Mutated in place.
        candidates: collection of speaker names whose paragraphs are kept;
            paragraphs by any other speaker are dropped.

    Returns:
        The same ``d`` object. Each value holds one entry per kept paragraph,
        and each entry is that paragraph's list of words (split on single
        spaces) with the ``SPEAKER:`` label removed.
    """
    # Placeholder speaker used until the first SPEAKER: label is seen;
    # paragraphs before that point are dropped as long as the placeholder
    # is not itself listed in candidates.
    speaker = "<START>"
    for paragraph in paragraph_array:
        text = paragraph.text
        # Nothing was said in a blank paragraph; skipping it also avoids
        # inspecting an empty first token.
        if not text.strip():
            continue
        words = text.split(' ')
        first_word = words[0]
        # Only switch speaker when the paragraph uses the SPEAKER: format.
        # endswith() is safe even if the first token is empty (leading space).
        if first_word.endswith(":"):
            speaker = first_word[:-1]
            spoken = words[1:]
        else:
            # Continuation of the previous speaker: there is no label to
            # strip, so keep every word.
            spoken = words
        # Only keep text spoken by the requested candidates.
        if speaker in candidates:
            d[speaker].append(spoken)
    return d
def process_url(url, speaker_dict, candidates, timeout=30):
    """Download one transcript page and add its paragraphs to speaker_dict.

    Args:
        url: address of the transcript page to fetch.
        speaker_dict: mapping passed through to ``text_to_dict``, which
            fills it in place.
        candidates: speaker names passed through to ``text_to_dict``.
        timeout: seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the whole run indefinitely.

    Returns:
        None; results are accumulated in ``speaker_dict``.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    # requests fetches the page and exposes its source as text
    html = requests.get(url, timeout=timeout).text
    # BeautifulSoup turns the raw source into a structure that is easy to
    # search and parse
    soup = BeautifulSoup(html, 'html5lib')
    # Get all 'p' tags with class 'story-body-text' (class name determined
    # by looking at the page source). The first and last matching paragraphs
    # are intro and ending, so they are sliced off.
    paragraphs = soup.find_all('p', {'class': 'story-body-text'})[1:-1]
    text_to_dict(paragraphs, speaker_dict, candidates)
def process_url_list(urls, speaker_dict, candidates):
    """Run process_url on every address in urls, in order.

    All pages accumulate into the same speaker_dict; nothing is returned.
    """
    for page_url in urls:
        process_url(page_url, speaker_dict, candidates)
# Speaker labels whose paragraphs are kept; any other speaker is dropped.
# NOTE(review): "O’MALLEY" uses a typographic apostrophe (U+2019),
# presumably to match the transcripts' own spelling - confirm before
# normalising it.
candidates = [
    'BUSH',
    'TRUMP',
    'RUBIO',
    'CARSON',
    'FIORINA',
    'KASICH',
    'CRUZ',
    'PAUL',
    'SANDERS',
    'CLINTON',
    "O’MALLEY",
]

# Transcript pages to scrape, processed in this order.
urls = [
    'http://www.nytimes.com/2015/11/11/us/politics/transcript-republican-presidential-debate.html',
    'http://www.nytimes.com/2016/02/14/us/politics/transcript-of-the-republican-presidential-debate.html',
    'http://www.nytimes.com/2016/01/15/us/politics/transcript-of-republican-presidential-debate.html',
    'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-transcript.html?_r=0',
    'http://www.nytimes.com/2016/01/18/us/politics/transcript-of-the-democratic-presidential-debate.html',
    'http://www.nytimes.com/2016/02/12/us/politics/transcript-of-the-democratic-presidential-debate-in-milwaukee.html',
    'http://www.nytimes.com/2016/02/07/us/politics/transcript-of-the-republican-presidential-debate-in-new-hampshire.html',
    'http://www.nytimes.com/2016/02/05/us/politics/transcript-of-the-democratic-presidential-debate.html',
    'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-preliminary-transcript.html',
    'http://www.nytimes.com/2015/12/16/us/politics/transcript-main-republican-presidential-debate.html',
]

# Filled in place: speaker name -> list of per-paragraph word lists.
speaker_dict = defaultdict(list)
process_url_list(urls, speaker_dict, candidates)