""" Get 'Science or Fiction' data from The Skeptics' Guide to the Universe website. """ import urllib2 import contextlib from time import sleep from bs4 import BeautifulSoup # put data in dictionary, counts = {episode_num: fiction_num, ...} counts = {} # first episode with science or fiction info is episode 43. for i in range(43,481): url = "http://www.theskepticsguide.org/podcast/sgu/%s" % i with contextlib.closing(urllib2.urlopen(url)) as response: html = response.read() soup = BeautifulSoup(html) scifi_title = None # find the 'Science or Fiction' section of the page for title in soup.find('div', {'class': 'podcast-detail'}).find_all('h3'): if 'Science or Fiction' in title.text: scifi_title = title # not all episodes have a round of scifi if scifi_title: # get the list of news items from section scifi_items = scifi_title.find_next('ul').select('li') # only want to look at scifis with 3 items. if len(scifi_items) == 3: for item in scifi_items: split_item = item.select('span') # get the item that is fiction and increment number if split_item[1].text.strip().lower() == 'fiction': item_number = split_item[0].text.split('#')[1].strip() try: item_number = int(item_number) counts[i] = item_number except ValueError: print "ValueError with episode %s!" % i continue # be very gentle on the website sleep(5) # count the number of times items 1, 2 and 3 were fiction. 
# On Python 3, dict.values() is a view with no .count(); take a list copy.
values = list(counts.values())
n1 = values.count(1)
n2 = values.count(2)
n3 = values.count(3)

# probability
# `total` is every recorded episode, including any whose stored item number
# is not 1, 2 or 3, so the three shares need not add up to exactly 1.
total = float(len(values))
# Guard against an empty scrape, which would otherwise divide by zero.
p1 = n1 / total if total else 0.0
p2 = n2 / total if total else 0.0
p3 = n3 / total if total else 0.0

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("%s %s %s" % (n1, n2, n3))
print("%s %s %s" % (p1, p2, p3))

# make a little plot
import matplotlib as mpl
import matplotlib.pyplot as plt

# The notebook magic `%matplotlib inline` is a syntax error outside IPython.
# Call it through the shell object when one exists, and skip it otherwise.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

mpl.rcParams['savefig.dpi'] = 100  # make the graph display nicer

# One stacked bar: each segment starts where the previous one ended.
ind = [0]
plt.figure(figsize=(2.5, 5))
plot1 = plt.bar(ind, p1, 0.2, color='#66c2a5')  # colors from http://colorbrewer2.org/
plot2 = plt.bar(ind, p2, 0.2, color='#fc8d62', bottom=p1)
plot3 = plt.bar(ind, p3, 0.2, color='#8da0cb', bottom=p1 + p2)

# remove x ticks (tick_params expects booleans here, not 'off' strings)
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.ylabel('Distribution of Fiction')
# Legend entries are listed top-down to match the stacking order of the bar.
plt.legend((plot3, plot2, plot1), ('Item 3', 'Item 2', 'Item 1'))
plt.show()

# yeah, that plot isn't useful at all
# do a chi squared test to see if the differences are actually significant
from scipy import stats

# With no expected frequencies given, chisquare tests against all
# categories being equally likely.
observations = [n1, n2, n3]
chisq, p = stats.chisquare(observations)

print("ChiSq: %s \n P: %s" % (chisq, p))