#!/usr/bin/env python
# coding: utf-8

# # The classic start, simple and easy in Python

# In[1]:

print("Hello world!")

# # Easy to use as a calculator

# In[2]:

2+2

# # Python is a dynamic language: no need to declare variable types before assignment

# In[3]:

foo = 2  # this is a comment
bar = 2
foo+bar, foo-bar, foo*bar, foo/bar, foo**bar

# # There are floats, integers, strings, lists

# In[4]:

a = [2.01, 3, 'hello']  # lists can contain anything
print(a, a[2], a[0:2], "<--- list slicing example")  # showing list/array slicing
print(a*2, "<--- multiplying a list")  # multiplying a list repeats it
a.append('goodbye')  # adding to a list
print(a, "<--- appending to a list")

# # You can also import packages for useful code content, such as functions, constants, whatever

# In[5]:

import math  # the import statement
print(math.pi)  # now you've got pi!

# In[6]:

from math import *  # another way to import things, but this method isn't recommended

print(pi)

# In[7]:

from numpy import pi as anotherpi  # the recommended way to import single functions
import numpy as np  # import a package under a new name; this is the most common

print(anotherpi)
print(np.pi)

# # Numpy is a powerful package that you'll use a lot

# In[8]:

print(np.zeros(10))  # make an array of zeros
print(np.ones(10))   # make an array of ones
b = np.arange(10)    # make an array of integer indices
print(b)

# In[9]:

print(b*b)  # multiplying arrays of the same size, element by element
print((b*b).reshape(2,5))  # you can also reshape arrays

# # numpy also has useful functions like mean, std

# In[10]:

testmean = 10
teststd = 2
ntestpoints = 100
# use numpy's random number sampling from a normal distribution (which has sigma=1)
# to generate some fake data
testdata = np.ones(ntestpoints)*testmean + np.random.randn(ntestpoints)*teststd
themean = np.sum(testdata)/ntestpoints  # calculating the mean without numpy
stderr = np.sqrt(1./(ntestpoints-1) * np.sum((np.mean(testdata)-testdata)**2))  # calculating the sample standard deviation
error_on_mean = stderr/np.sqrt(ntestpoints)
print("The mean: " + str(np.mean(testdata)) + ", or " + str(themean))  # our mean and numpy's mean should be the same
print("The standard deviation: " + str(np.std(testdata, ddof=1)) + ", or " + str(stderr))  # note the ddof=1
print("The error on the mean: " + str(error_on_mean))

# # Plotting is inside the matplotlib.pyplot package

# In[11]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')  # make plots appear inline in the notebook

# In[12]:

plt.figure(figsize=(10,6))
plt.hist(testdata, bins=ntestpoints//10, histtype='stepfilled', alpha=.5, color='g', range=[5,15])
# note the integer division (//) so the bin count is an integer
plt.xlabel('Value', fontsize=20)
plt.ylabel('Number of measurements \n in each bin', fontsize=20)
plt.title('Histogram of test data', fontsize=22)
plt.axvline(themean, linestyle='-', color='r')
plt.axvline(themean+error_on_mean, linestyle='--', color='b')
plt.axvline(themean-error_on_mean, linestyle='--', color='b')
plt.axvline(testmean, color='k', linestyle='-')
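# # Optional cross-check: scipy can compute the standard error of the mean for you
# A minimal sketch (not in the original notebook), assuming scipy is installed
# alongside numpy; `stats.sem` uses ddof=1 by default, matching our hand-rolled value.

# In[ ]:

from scipy import stats

print(stats.sem(testdata))            # scipy's standard error of the mean
print(stderr/np.sqrt(ntestpoints))    # should match the value computed by hand above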
# # You can also read in data from a text file with numpy's loadtxt function

# In[13]:

# Click on the menu for the graph on
# https://www.google.com/trends/explore?q=%2Fm%2F02896,%2Fm%2F05g3b,Super%20Bowl
# and go to "Download csv" to download this data,
# and put its filename below
filename = '/home/andrew/Downloads/cowboys.csv'
# this data has four columns:
# date, cowboys searches, patriots searches, superbowl searches

# Use numpy's recfromcsv function to read in the csv file from Google;
# another option is to use np.loadtxt
# (an example of that syntax is at the end of this notebook)
goog = np.recfromcsv(filename, skip_header=3, names=['week','cowboys','pats','supabowl'], encoding='utf-8')
# skip_header skips a few lines at the top that aren't data,
# and names lets us reference the records by a variable name rather than by integer index
# (encoding='utf-8' makes sure the dates come in as strings rather than bytes)

# this translates the "2004-01" date string into fractional years as a floating point number
timedat = np.array([int(datedat.split('-')[0]) + int(datedat.split('-')[1])/12. for datedat in goog['week']])

plt.figure(figsize=(10,3))
for datname in ['cowboys','pats','supabowl']:
    plt.plot(timedat, goog[datname], label=datname, linewidth=2, alpha=.7)
plt.legend(loc='upper center', fontsize=10)
#plt.yscale('log')
plt.ylabel('Google interest [arb. units]', fontsize=12)
plt.xlabel('Time since 0 BCE [years]', fontsize=12)
plt.title('Hand-egg sports teams and their championships \n as measured through Google search interest')
plt.xticks(np.arange(2004,2018))
print('how bout dem cowboys')

# put a vertical line for every patriots superbowl visit
patsup = [2004,2007,2011,2014]  # years they went to the superbowl
for goodyear in patsup:  # loop over them
    plt.axvline(goodyear+1.2, color='k')

# # You can define functions to operate on variables

# In[16]:

# start function definitions with "def " and the name of the function;
# inside the parentheses you place the variables which squared takes in
def squared(thingtosquare):
    """This is a documentation string for the function squared,
    which takes in a number (thingtosquare) and outputs its square (thingsquared)"""
    thingsquared = thingtosquare ** 2  # simply squares the input variable
    return thingsquared  # and returns it

# In[17]:

squared(4)

# In[18]:

print(squared(b))
help(squared)

# In[19]:

# define a function to make a gaussian with input values, used later
def mygauss(x, amp, center, sigma):
    """This is an example Gaussian function, which takes in x values,
    the amplitude (amp), the center x value (center) and the sigma of the
    Gaussian, and returns the respective y values."""
    y = amp * np.exp(-.5*((x-center)/sigma)**2)
    return y

# In[20]:

npts = 40  # the number of points on the x axis
x = np.linspace(0, 10, npts)  # make a series of npts linearly spaced values between 0 and 10
amp = 3.1
center = 5.1
sigma = 1.11
y = mygauss(x, amp, center, sigma)
print(y)

# In[21]:

plt.figure(figsize=(10,6))
plt.plot(x, y, 'bo', label='data points')
plt.text(center, amp, "<-- peak is here", fontsize=16)  # places text at any x/y location on the graph
plt.xlabel('X axis', fontsize=20)
plt.ylabel('Y axis', fontsize=20)
plt.title('A gaussian plot \n with some extras!', fontsize=20)
plt.legend(loc='best')
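# # An aside: the full width at half maximum (FWHM) of a Gaussian
# For any Gaussian, FWHM = 2*sqrt(2*ln(2))*sigma, roughly 2.355*sigma. A minimal
# sketch (not in the original notebook) marking it on the same mygauss curve:

# In[ ]:

fwhm = 2*np.sqrt(2*np.log(2))*sigma  # ~2.355 sigma for any Gaussian
halfmax_x = np.array([center - fwhm/2., center + fwhm/2.])
plt.plot(x, y, 'bo')
plt.plot(halfmax_x, mygauss(halfmax_x, amp, center, sigma), 'r-', linewidth=3)  # segment at half maximum
plt.title('FWHM = %.3f for sigma = %.2f' % (fwhm, sigma))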
# # Now let's fake some measurement process by adding some errors, and fit this with a model

# ### Simulate some data, using a Gaussian with random noise added

# In[133]:

def mygauss_errors(x, amp, center, sigma, noise_amp):
    """Takes in x values and the amplitude, center, and sigma of a Gaussian,
    as well as a noise amplitude which multiplies random normally distributed
    (sigma=1, center=0) noise that is added to the Gaussian"""
    y = amp * np.exp(-.5*((x-center)/sigma)**2)
    yerr = y + np.random.randn(len(x)) * noise_amp
    return yerr

# make up some hidden parameters for the noisy Gaussian; we will try to discover them with optimization later
npts, minx, maxx = 15, 0, 10
x = np.linspace(minx, maxx, npts)  # making linearly spaced x values between min and max with n points
amp, center, sigma = 50.1, 5.3, 1.11  # made-up hidden parameters
noise_amp = .05*amp  # this multiplies a unit normal distribution to simulate noise; larger means more noise

# get the noisy y values using the above input parameters
yerr = mygauss_errors(x, amp, center, sigma, noise_amp)

# for plotting purposes, also make a more "continuous" x variable, with much finer spacing
xcont = np.linspace(0, 10, 1000)
ycont = mygauss(xcont, amp, center, sigma)  # also get the true continuous y values

plt.figure(figsize=(8,4))
plt.plot(xcont, ycont, 'r', label='True values (nature)')
plt.errorbar(x, yerr, yerr=noise_amp, marker='o', linestyle='None', label='Noisy data')
plt.title('Fake data example with a noisy Gaussian')
plt.legend()

# # Now that we have some fake data, let's pretend we don't know the parameters and try to find them via $\chi^2$ minimization!

# In[110]:

# to fit a function to some data, you will need to minimize (optimize) some statistic;
# scipy has a function which will take in a model and data and find the best parameters
# for you using a chi-squared minimization
from scipy.optimize import curve_fit
# you can always google for more info about common functions, but python has documentation strings
# that you can access in the notebook by typing help(curve_fit);
# executing "curve_fit?" would also pop up a window with the same information

# In[134]:

# use the curve_fit function to find optimal parameters for the mygauss function, given the data
# (it does better with a good guess for starting parameters, though none or even bad guesses work)
ampguess, centerguess, sigmaguess = 42, 4.2, .42
params, cov = curve_fit(mygauss, x, yerr, p0=[ampguess, centerguess, sigmaguess])
print("These are the best fit amplitude, center, and sigma")
print(params)
print("These are the covariances between those parameters (i.e. diagonal elements are sigma**2 for each param.)")
print(cov)

# these will print out the values and their errors, according to the covariance matrix
print("True amplitude, center, and sigma: " + str(amp) + ", " + str(center) + ', ' + str(sigma))
print("Fitted amplitude, center, and sigma: " + str(params))
print("With estimated errors +/- " + str(np.sqrt(np.diag(cov))))  # errors are the square roots of the diagonal variances
# reduced chi-squared: sum of squared residuals over sigma**2, divided by the degrees of freedom (N data points - 3 parameters)
chi2 = 1./(len(x)-3)*np.sum((yerr-mygauss(x,*params))**2/(noise_amp)**2)
print('Reduced chi-squared of fit: ' + str(chi2))

# In[135]:

list(zip(params, np.sqrt(np.diag(cov))))  # pair each best-fit parameter with its error

# # Now plot all the curves together: the noisy data, the true nature, and the fit

# In[136]:

plt.figure(figsize=(10,6))
plt.errorbar(x, yerr, yerr=noise_amp, marker='o', linestyle='None', label='Noisy data')
plt.plot(xcont, mygauss(xcont, amp, center, sigma), label='True data (nature)')
fiterrs = np.sqrt(np.diag(cov))
plt.fill_between(xcont, mygauss(xcont, *(params-fiterrs)), mygauss(xcont, *(params+fiterrs)), color='orange', alpha=.5)
plt.plot(xcont, mygauss(xcont, *params), label='Fitted curve', color='r')
plt.axhline(0, color='k', linestyle='--')
plt.xlabel('X axis', fontsize=20)
plt.ylabel('Y axis', fontsize=20)
plt.title('Our result: A noisy Gaussian with fit', fontsize=20)
plt.legend(loc='upper left')

# ### time is important, and some of it is worth keeping track of

# In[140]:

import time
print(time.time())  # one way to keep track of time is to use the system time
get_ipython().run_line_magic('pinfo', 'time.time')
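# # Optional: curve_fit can also use the measurement uncertainties directly
# A minimal sketch (not in the original notebook): passing the known noise level
# through curve_fit's `sigma` and `absolute_sigma` arguments weights the fit and
# makes the covariance matrix directly interpretable as parameter variances.

# In[ ]:

params_w, cov_w = curve_fit(mygauss, x, yerr,
                            p0=[ampguess, centerguess, sigmaguess],
                            sigma=np.ones(len(x))*noise_amp,  # one uncertainty per data point
                            absolute_sigma=True)              # treat sigma as absolute, not relative
print("Weighted fit parameters:", params_w)
print("Weighted fit errors:", np.sqrt(np.diag(cov_w)))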
# ## Just for fun... we can repeat all the above, faking the data and fitting it as many times as we want. Does it return reasonable model fits? This is a way of checking the measurement/modeling process for bias. Hint: lower/raise the number of points (npts) and noise amplitude (noise_amp) variables below to see how e.g. the reduced chi-squared distribution changes

# In[147]:

npts, minx, maxx = 100, 0, 9
x = np.linspace(minx, maxx, npts)  # making linearly spaced x values between min and max with n points
amp, center, sigma = 50.1, 5.3, 1.11  # made-up hidden parameters
noise_amp = 2  # this multiplies a unit normal distribution to simulate noise; larger means more noise

# Now just repeat the experiment with all the above true parameters
# and store the resulting model parameters
nexpts = 10000  # the number of times to run the expt; 10k takes a few seconds
allchi2 = np.zeros(nexpts)
allamp, allcen, allsig = np.zeros(nexpts), np.zeros(nexpts), np.zeros(nexpts)
allampstd, allcenstd, allsigstd = np.zeros(nexpts), np.zeros(nexpts), np.zeros(nexpts)
tstart = time.time()  # mark the start time
for i in range(nexpts):  # simply loop over repeated "experiments", saving the model parameters each time
    yerr = mygauss_errors(x, amp, center, sigma, noise_amp)  # fake the data
    params, cov = curve_fit(mygauss, x, yerr, p0=[ampguess, centerguess, sigmaguess])  # fit the model
    allchi2[i] = 1./(len(x)-3)*np.sum((yerr-mygauss(x,*params))**2/(noise_amp)**2)  # record chi2
    allamp[i], allcen[i], allsig[i] = params  # save parameters
    allampstd[i], allcenstd[i], allsigstd[i] = np.sqrt(np.diag(cov))  # save the reported parameter errors
print("took " + str(time.time()-tstart) + " seconds")  # current time minus start time is total time to run

# ### there are lots of things we could now plot... for instance, is the reduced chi2 distribution reasonable?

# In[148]:

plt.hist(allchi2, bins=100, histtype='step')
plt.xlabel('$\\chi^2_r$', fontsize=20)
plt.title('The $\\chi^2$ of ' + str(nexpts) + ' repeated "measurement simulations"')
plt.axvline(1, color='k')

# ### looking at the distribution of model fits and residuals

# In[144]:

nplots = 50
thetruey = mygauss(xcont, amp, center, sigma)
plt.figure(figsize=(10,4))
for i in range(nplots):
    plt.subplot(121)
    thefity = mygauss(xcont, allamp[i], allcen[i], allsig[i])
    plt.plot(xcont, thefity, 'b.', alpha=.01)  # plot the best fit model for this expt
    plt.subplot(122)
    plt.plot(xcont, (thefity-thetruey)/amp*100., 'k.', alpha=.01)  # plot the residuals
plt.ylabel('% difference')
plt.title('residuals of best fit model minus truth')
plt.axhline(0, color='k', linestyle='-', label='residuals')
plt.xlabel('x')
plt.legend()
plt.subplot(121)
plt.plot(xcont, ycont, 'k', linewidth=2, label='true nature')  # plot the true nature being measured
plt.title('spread of best fit model \n in repeated experiments')
plt.legend()

# In[150]:

plt.figure(figsize=(12,3))
plt.suptitle('Reported parameter fitting "errors"')
plt.subplot(131)
plt.hist(allampstd)
plt.xlabel('amplitude')
plt.subplot(132)
plt.hist(allcenstd)
plt.xlabel('center')
plt.subplot(133)
plt.hist(allsigstd)
plt.xlabel('sigma')
# why are the reported errors for sigma and center the same??
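# # Optional extra bias check: "pull" distributions
# A minimal sketch (not in the original notebook): if the reported errors are honest,
# the pulls (fitted value minus true value, divided by the reported error) should
# follow a unit normal distribution, centered on 0 with a standard deviation of 1.

# In[ ]:

pulls = (allcen - center)/allcenstd  # pull of the fitted center in each experiment
plt.hist(pulls, bins=100, histtype='step')
plt.xlabel('(fitted center - true center) / reported error')
plt.title('Pulls for the Gaussian center')
print("pull mean:", np.mean(pulls), "pull std:", np.std(pulls))  # expect ~0 and ~1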
# In[117]:

# This was plotting Xiangdong's interesting lab data for the first class
xdat = np.recfromtxt('/home/andrew/Work/Data sorted by Voltage.txt', skip_header=2,
                     names=['n','pvol','mvol','foo','bar'])
plt.figure(figsize=(10,4))
plt.subplot(121)
plt.plot(xdat['n'], xdat['pvol'], label='+10 V', alpha=.7)
plt.plot(xdat['n'], xdat['mvol'], label='-10 V', alpha=.7)
plt.xlabel('Measurement #')
plt.ylabel('Voltage [$10^{-3}$ V]')
plt.legend()
plt.subplot(122)
plt.hist(xdat['pvol'], bins=100, histtype='step', label='+10 V')
plt.hist(xdat['mvol'], bins=100, histtype='step', label='-10 V')
pvmed, nvmed = np.median(xdat['pvol']), np.median(xdat['mvol'])
pvstd, nvstd = np.std(xdat['pvol'])/np.sqrt(len(xdat)), np.std(xdat['mvol'])/np.sqrt(len(xdat))
print("The median of the +10/-10 V voltage measurements is:", pvmed, nvmed)
print("The standard error is:", pvstd, nvstd)
plt.axvline(0, color='k')
plt.axvline(pvmed, color='b')
plt.axvline(nvmed, color='g')
plt.xlabel('millivolts')
plt.legend()
plt.suptitle("WELCOME TO $\\rho\\hbar\\psi s 122$", fontsize=30)
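# # As promised earlier, the np.loadtxt syntax for the same kind of file
# A minimal sketch (not in the original notebook), assuming the same whitespace-delimited
# file as the cell above; loadtxt returns plain arrays instead of named records.

# In[ ]:

n, pvol, mvol = np.loadtxt('/home/andrew/Work/Data sorted by Voltage.txt',
                           skiprows=2, usecols=(0,1,2), unpack=True)
print(n[:5], pvol[:5], mvol[:5])  # first few rows of each column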