%pylab inline

figsize( 12.5, 5 )
import pymc as mc

sample_size = 100000
expected_value = lambda_ = 4.5
poi = mc.rpoisson
N_samples = range(1,sample_size,100)

for k in range(3):

    samples = poi( lambda_, size = sample_size ) 
    
    partial_average = [ samples[:i].mean() for i in N_samples ]
    
    plt.plot( N_samples, partial_average, lw=1.5,label="average \
    of  $n$ samples; seq. %d"%k)
    

plt.plot( N_samples, expected_value*np.ones_like( partial_average), \
    ls = "--", label = "true expected value", c = "k" )

plt.ylim( 4.35, 4.65) 
plt.title( "Convergence of the average of \n random variables to its \
expected value" )
plt.ylabel( "average of $n$ samples" )
plt.xlabel( "# of samples, $n$")
plt.legend()

figsize( 12.5, 4)
N_Y = 250
D_N_results = [] 
N_array = np.arange( 0, 50000, 2500 )
lambda_ = 4.5
expected_value = 4.5

def D_N( n ):
    Z = poi( lambda_, size = (n, N_Y) )
    average_Z = Z.mean(axis=0)
    return np.sqrt( (  (average_Z - expected_value)**2  ).mean() )
    
    
for n in N_array:
    D_N_results.append( D_N(n) )


plt.xlabel( "$N$" )
plt.ylabel( "expected squared-distance from true value" )
plt.plot(N_array, D_N_results, lw = 1, 
            label="expected distance between\n\
expected value and \naverage of $N$ random variables.")
plt.plot( N_array, np.sqrt(expected_value)/np.sqrt(N_array), lw = 3, ls = "--", 
        label = r"$\frac{\sqrt{\lambda}}{\sqrt{N}}$" )
plt.legend()
plt.title( "How 'fast' is the sample average converging? " )

import pymc as mc

N = 10000
print np.mean( [ mc.rexponential( 0.5 )>10 for i in range(N) ] )

figsize( 12.5, 4) 
std_height = 15
mean_height = 150

n_counties = 5000
pop_generator = mc.rdiscrete_uniform
norm = mc.rnormal

#generate some artificial population numbers
population = pop_generator(100, 1500, size = n_counties )

average_across_county = np.zeros( n_counties )
for i in range( n_counties ):
    #generate some individuals and take the mean
    average_across_county[i] = norm(mean_height, 1./std_height**2,
                                        size=population[i] ).mean()
    

#where are the extreme populations?
i_min = np.argmin( average_across_county )
i_max = np.argmax( average_across_county )

#plot population vs. average
plt.scatter( population, average_across_county, alpha = 0.5, c="#7A68A6")
plt.scatter( [ population[i_min], population[i_max] ], 
           [average_across_county[i_min], average_across_county[i_max] ],
           s = 60, marker = "o", facecolors = "none",
           edgecolors = "#A60628", linewidths = 1.5, 
            label="extreme heights")

plt.xlim( 100, 1500 )
plt.title( "Average height vs. County Population")
plt.xlabel("County Population")
plt.ylabel("Average height in county")
plt.plot( [100, 1500], [150, 150], color = "k", label = "true expected \
height", ls="--" )
plt.legend(scatterpoints = 1)

print "Population sizes of 10 'shortest' counties: "
print population[ np.argsort( average_across_county )[:10] ]
print
print "Population sizes of 10 'tallest' counties: "
print population[ np.argsort( -average_across_county )[:10] ]

figsize( 12.5, 6.5 )
data = np.genfromtxt( "./data/census_data.csv", skip_header=1, 
                        delimiter= ",")
plt.scatter( data[:,1], data[:,0], alpha = 0.5, c="#7A68A6")
plt.title("Census mail-back rate vs Population")
plt.ylabel("Mail-back rate")
plt.xlabel("population of block-group")
plt.xlim(-100, 15e3 )
plt.ylim( -5, 105)

i_min = np.argmin(  data[:,0] )
i_max = np.argmax(  data[:,0] )

 
plt.scatter( [ data[i_min,1], data[i_max, 1] ], 
             [ data[i_min,0],  data[i_max,0] ],
             s = 60, marker = "o", facecolors = "none",
             edgecolors = "#A60628", linewidths = 1.5, 
             label="most extreme points")

plt.legend(scatterpoints = 1);


from IPython.core.display import Image
#adding a number to the end of the %run call with get the ith top photo.
%run top_pic_comments.py 2

Image(top_post_url)


"""
contents: an array of the text from all comments on the pic
votes: a 2d numpy array of upvotes, downvotes for each comment.
"""
n_comments = len(contents )
comments = np.random.randint( n_comments, size=4)
print "Some Comments (out of %d total) \n-----------"%n_comments
for i in comments:
    print '"' + contents[i] + '"'
    print"upvotes/downvotes: ",votes[i,:]
    print

import pymc as mc

def posterior_upvote_ratio( upvotes, downvotes, samples = 20000):
    N = upvotes + downvotes
    upvote_ratio = mc.Uniform( "upvote_ratio", 0, 1 )
    observations = mc.Binomial( "obs",  N, upvote_ratio, value = upvotes, observed = True)
    
    model = mc.Model( [upvote_ratio, observations ]) 
    map_ = mc.MAP(model).fit()
    mcmc = mc.MCMC(model)
    
    mcmc.sample(samples, samples/4)
    
    return mcmc.trace("upvote_ratio")[:]
    

figsize( 11., 8)
posteriors = []
colours = ["#348ABD", "#A60628", "#7A68A6", "#467821", "#CF4457"]
for i in range(len(comments)):
    j = comments[i]
    posteriors.append( posterior_upvote_ratio( votes[j, 0], votes[j,1] ) )
    plt.hist( posteriors[i], bins = 18, normed = True, alpha = .9, 
            histtype="step",color = colours[i%5], lw = 3,
            label = '(%d up:%d down)\n%s...'%(votes[j, 0], votes[j,1], contents[j][:50]) )
    plt.hist( posteriors[i], bins = 18, normed = True, alpha = .2, 
            histtype="stepfilled",color = colours[i], lw = 3, )
    #plt.title( 'upvotes:downvotes: %d:%d'%(votes[j,0], votes[j,1]) )

    
plt.legend(loc="upper left")
plt.xlim( 0, 1)
plt.title("Posterior distributions of upvote ratios on different comments");


N = posteriors[0].shape[0]
lower_limits = []

for i in range(len(comments)):
    j = comments[i]
    plt.hist( posteriors[i], bins = 20, normed = True, alpha = .9, 
            histtype="step",color = colours[i], lw = 3,
            label = '(%d up:%d down)\n%s...'%(votes[j, 0], votes[j,1], contents[j][:50]) )
    plt.hist( posteriors[i], bins = 20, normed = True, alpha = .2, 
            histtype="stepfilled",color = colours[i], lw = 3, )
    v = np.sort( posteriors[i] )[ int(0.05*N) ]
    #plt.vlines( v, 0, 15 , color = "k", alpha = 1, linewidths=3 )
    plt.vlines( v, 0, 10 , color = colours[i], linestyles = "--",  linewidths=3  )
    lower_limits.append(v)
    plt.legend(loc="upper left")

    
plt.legend(loc="upper left")

plt.title("Posterior distributions of upvote ratios on different comments");
order = argsort( -np.array( lower_limits ) )
print order, lower_limits

    
def intervals(u,d):
    a = 1. + u
    b = 1. + d
    mu = a/(a+b)
    std_err = 1.65*np.sqrt( (a*b)/( (a+b)**2*(a+b+1.) ) )
    return ( mu, std_err )


print "Approximate lower bounds:"
posterior_mean, std_err  = intervals(votes[:,0],votes[:,1])
lb = posterior_mean - std_err
print lb
print

print "Top 40 Sorted according to approximate lower bounds:"
print
order = np.argsort( -lb )
ordered_contents = []
for i in order[:40]:
    ordered_contents.append( contents[i] )
    print  votes[i,0], votes[i,1], contents[i]
    print "-------------"


r_order = order[::-1][:40]
plt.errorbar( posterior_mean[r_order], np.arange( len(r_order) ), 
               xerr=std_err[r_order],xuplims=True, capsize=0, fmt="o",
                color = "#7A68A6")
plt.xlim( 0.3, 1)
plt.yticks( np.arange( len(r_order)-1,-1,-1 ), map( lambda x: x[:30], ordered_contents) );


## Enter code here
import scipy.stats as stats
exp = stats.expon( scale=4 )
N = 1e5
X = exp.rvs( N )
## ...

from IPython.core.display import HTML
def css_styling():
    styles = open("../styles/custom.css", "r").read()
    return HTML(styles)
css_styling()