In [1]:
""" Get 'Science or Fiction' data from The Skeptics' Guide to the Universe website. """
import urllib2
import contextlib
from time import sleep

from bs4 import BeautifulSoup

# Scrape results are collected here: counts = {episode_num: fiction_num, ...}
counts = {}

# first episode with science or fiction info is episode 43.
FIRST_EPISODE = 43
LAST_EPISODE = 480  # last episode scraped (inclusive)
EPISODE_URL = "http://www.theskepticsguide.org/podcast/sgu/%s"
REQUEST_DELAY = 5  # seconds to pause after every request


def _find_fiction_item(html, episode):
    """Return the number of the item marked 'Fiction' on an episode page.

    html    -- raw HTML of the episode page.
    episode -- episode number, only used in diagnostic messages.

    Returns the item number as an int, or None when the page has no
    'Science or Fiction' section, the section does not list exactly three
    items, or no item number could be parsed. If several items are marked
    as fiction, the last one that parses wins.
    """
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed alongside BeautifulSoup.
    soup = BeautifulSoup(html, 'html.parser')

    detail = soup.find('div', {'class': 'podcast-detail'})
    if detail is None:
        # page without the expected layout (e.g. an error page)
        return None

    # find the 'Science or Fiction' section of the page
    scifi_title = None
    for title in detail.find_all('h3'):
        if 'Science or Fiction' in title.text:
            scifi_title = title

    # not all episodes have a round of scifi
    if scifi_title is None:
        return None

    # get the list of news items from section
    item_list = scifi_title.find_next('ul')
    if item_list is None:
        return None
    scifi_items = item_list.select('li')

    # only want to look at scifis with 3 items.
    if len(scifi_items) != 3:
        return None

    fiction_number = None
    for item in scifi_items:
        split_item = item.select('span')

        # an item needs both a label span and a verdict span
        if len(split_item) < 2:
            continue

        # get the item that is fiction and read its number
        if split_item[1].text.strip().lower() == 'fiction':
            try:
                fiction_number = int(split_item[0].text.split('#')[1].strip())
            except (ValueError, IndexError) as err:
                # IndexError: label has no '#'; ValueError: not a number
                print("%s with episode %s!" % (type(err).__name__, episode))

    return fiction_number


for i in range(FIRST_EPISODE, LAST_EPISODE + 1):
    url = EPISODE_URL % i

    try:
        with contextlib.closing(urllib2.urlopen(url)) as response:
            html = response.read()
    except urllib2.URLError as err:
        # HTTPError is a URLError subclass; report the episode and carry on
        # instead of throwing away everything scraped so far.
        print("%s with episode %s!" % (type(err).__name__, i))
    else:
        item_number = _find_fiction_item(html, i)
        if item_number is not None:
            counts[i] = item_number

    # be very gentle on the website
    sleep(REQUEST_DELAY)
ValueError with episode 247!
In [2]:
# Tally how often each position (item 1, 2 or 3) turned out to be the fiction.
values = counts.values()
n1, n2, n3 = (values.count(position) for position in (1, 2, 3))

# Express each tally as a fraction of all recorded rounds.
total = float(len(values))
p1, p2, p3 = (tally / total for tally in (n1, n2, n3))

print("%s %s %s" % (n1, n2, n3))
print("%s %s %s" % (p1, p2, p3))
128 119 133
0.336842105263 0.313157894737 0.35
In [3]:
# make a little plot
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

mpl.rcParams['savefig.dpi'] = 100 # make the graph display nicer

ind = [0]

plt.figure(figsize=(2.5,5))
plot1 = plt.bar(ind, p1, 0.2, color='#66c2a5') # colors from http://colorbrewer2.org/
plot2 = plt.bar(ind, p2, 0.2, color='#fc8d62', bottom=p1)
plot3 = plt.bar(ind, p3, 0.2, color='#8da0cb', bottom=p1+p2)

# remove x ticks
plt.tick_params(axis='x',which='both',bottom='off',top='off',labelbottom='off')

plt.ylabel('Distribution of Fiction')
plt.legend((plot3, plot2, plot1), ('Item 3', 'Item 2', 'Item 1'))
plt.show()
In [4]:
# The stacked bar cannot tell us whether the spread between items matters,
# so run a chi-squared test on the three tallies to check for significance.
from scipy import stats

# With no expected frequencies supplied, chisquare compares the observed
# counts against all categories being equally likely.
observations = [n1, n2, n3]
chisq, p = stats.chisquare(f_obs=observations)

print("ChiSq: %s \n    P: %s" % (chisq, p))
ChiSq: 0.794736842105 
    P: 0.672086369247