#!/usr/bin/env python
# coding: utf-8

# # The classic start, simple and easy in Python

# In[1]:

print("Hello world!")

# # Easy to use as a calculator

# In[2]:

2+2

# # Python is a dynamic language: no need to declare variable types before assignment

# In[3]:

foo = 2  # this is a comment
bar = 2
foo+bar, foo-bar, foo*bar, foo/bar, foo**bar

# # There are floats, integers, strings, lists

# In[4]:

a = [2.01, 3, 'hello']  # lists can contain anything
print(a, a[2], a[0:2], "<--- list slicing example")  # showing list/array slicing
print(a*2, "<--- multiplying a list")  # multiplying a list repeats it
a.append('goodbye')  # adding to a list
print(a, "<--- appending to a list")

# # You can also import packages for useful code content, such as functions, constants, whatever

# In[5]:

import math  # the import statement
print(math.pi)  # now you've got pi!

# In[6]:

from math import *  # another way to import things, but this method isn't recommended

print(pi)

# In[7]:

from numpy import pi as anotherpi  # the recommended way to import single functions
import numpy as np  # import a package under a new name; this is the most common

print(anotherpi)
print(np.pi)

# # Numpy is a powerful package that you'll use a lot

# In[8]:

print(np.zeros(10))  # make an array of zeros
print(np.ones(10))   # make an array of ones
b = np.arange(10)    # make an array of integer indices
print(b)

# In[9]:

print(b*b)  # multiplying arrays of the same size, element by element
print((b*b).reshape(2,5))  # you can also reshape arrays

# # numpy also has useful functions like mean, std

# In[10]:

testmean = 10
teststd = 2
ntestpoints = 100
# use numpy's random number sampling from a normal distribution (which has sigma=1)
# to generate some fake data
testdata = np.ones(ntestpoints)*testmean + np.random.randn(ntestpoints)*teststd
themean = np.sum(testdata)/ntestpoints  # calculating the mean without numpy
stderr = np.sqrt(1./(ntestpoints-1) * np.sum((np.mean(testdata)-testdata)**2))  # calculating the sample standard deviation
error_on_mean = stderr/np.sqrt(ntestpoints)
print("The mean: " + str(np.mean(testdata)) + ", or " + str(themean))  # our mean and numpy's mean should be the same
print("The standard deviation: " + str(np.std(testdata, ddof=1)) + ", or " + str(stderr))  # note the ddof=1
print("The error on the mean: " + str(error_on_mean))

# # Plotting is inside the matplotlib.pyplot package

# In[11]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')  # make plots appear inline in the notebook

# In[12]:

plt.figure(figsize=(10,6))
plt.hist(testdata, bins=ntestpoints//10, histtype='stepfilled', alpha=.5, color='g', range=[5,15])
# note the integer division (//) so the bin count is an integer
plt.xlabel('Value', fontsize=20)
plt.ylabel('Number of measurements \n in each bin', fontsize=20)
plt.title('Histogram of test data', fontsize=22)
plt.axvline(themean, linestyle='-', color='r')
plt.axvline(themean+error_on_mean, linestyle='--', color='b')
plt.axvline(themean-error_on_mean, linestyle='--', color='b')
plt.axvline(testmean, color='k', linestyle='-')
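# # Optional cross-check: scipy can compute the standard error of the mean for you
# A minimal sketch (not in the original notebook), assuming scipy is installed
# alongside numpy; `stats.sem` uses ddof=1 by default, matching our hand-rolled value.

# In[ ]:

from scipy import stats

print(stats.sem(testdata))            # scipy's standard error of the mean
print(stderr/np.sqrt(ntestpoints))    # should match the value computed by hand above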
# # You can also read in data from a text file with numpy's loadtxt function

# In[13]:

# Click on the menu for the graph on
# https://www.google.com/trends/explore?q=%2Fm%2F02896,%2Fm%2F05g3b,Super%20Bowl
# and go to "Download csv" to download this data,
# and put its filename below
filename = '/home/andrew/Downloads/cowboys.csv'
# this data has four columns:
# date, cowboys searches, patriots searches, superbowl searches

# Use numpy's recfromcsv function to read in the csv file from Google;
# another option is to use np.loadtxt
# (an example of that syntax is at the end of this notebook)
goog = np.recfromcsv(filename, skip_header=3, names=['week','cowboys','pats','supabowl'], encoding='utf-8')
# skip_header skips a few lines at the top that aren't data,
# and names lets us reference the records by a variable name rather than by integer index
# (encoding='utf-8' makes sure the dates come in as strings rather than bytes)

# this translates the "2004-01" date string into fractional years as a floating point number
timedat = np.array([int(datedat.split('-')[0]) + int(datedat.split('-')[1])/12. for datedat in goog['week']])

plt.figure(figsize=(10,3))
for datname in ['cowboys','pats','supabowl']:
    plt.plot(timedat, goog[datname], label=datname, linewidth=2, alpha=.7)
plt.legend(loc='upper center', fontsize=10)
#plt.yscale('log')
plt.ylabel('Google interest [arb. units]', fontsize=12)
plt.xlabel('Time since 0 BCE [years]', fontsize=12)
plt.title('Hand-egg sports teams and their championships \n as measured through Google search interest')
plt.xticks(np.arange(2004,2018))
print('how bout dem cowboys')

# put a vertical line for every patriots superbowl visit
patsup = [2004,2007,2011,2014]  # years they went to the superbowl
for goodyear in patsup:  # loop over them
    plt.axvline(goodyear+1.2, color='k')

# # You can define functions to operate on variables

# In[16]:

# start function definitions with "def " and the name of the function;
# inside the parentheses you place the variables which squared takes in
def squared(thingtosquare):
    """This is a documentation string for the function squared,
    which takes in a number (thingtosquare) and outputs its square (thingsquared)"""
    thingsquared = thingtosquare ** 2  # simply squares the input variable
    return thingsquared  # and returns it

# In[17]:

squared(4)

# In[18]:

print(squared(b))
help(squared)

# In[19]:

# define a function to make a gaussian with input values, used later
def mygauss(x, amp, center, sigma):
    """This is an example Gaussian function, which takes in x values,
    the amplitude (amp), the center x value (center) and the sigma of the
    Gaussian, and returns the respective y values."""
    y = amp * np.exp(-.5*((x-center)/sigma)**2)
    return y

# In[20]:

npts = 40  # the number of points on the x axis
x = np.linspace(0, 10, npts)  # make a series of npts linearly spaced values between 0 and 10
amp = 3.1
center = 5.1
sigma = 1.11
y = mygauss(x, amp, center, sigma)
print(y)

# In[21]:

plt.figure(figsize=(10,6))
plt.plot(x, y, 'bo', label='data points')
plt.text(center, amp, "<-- peak is here", fontsize=16)  # places text at any x/y location on the graph
plt.xlabel('X axis', fontsize=20)
plt.ylabel('Y axis', fontsize=20)
plt.title('A gaussian plot \n with some extras!', fontsize=20)
plt.legend(loc='best')
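# # An aside: the full width at half maximum (FWHM) of a Gaussian
# For any Gaussian, FWHM = 2*sqrt(2*ln(2))*sigma, roughly 2.355*sigma. A minimal
# sketch (not in the original notebook) marking it on the same mygauss curve:

# In[ ]:

fwhm = 2*np.sqrt(2*np.log(2))*sigma  # ~2.355 sigma for any Gaussian
halfmax_x = np.array([center - fwhm/2., center + fwhm/2.])
plt.plot(x, y, 'bo')
plt.plot(halfmax_x, mygauss(halfmax_x, amp, center, sigma), 'r-', linewidth=3)  # segment at half maximum
plt.title('FWHM = %.3f for sigma = %.2f' % (fwhm, sigma))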
# # Now let's fake some measurement process by adding some errors, and fit this with a model

# ### Simulate some data, using a Gaussian with random noise added

# In[133]:

def mygauss_errors(x, amp, center, sigma, noise_amp):
    """Takes in x values and the amplitude, center, and sigma of a Gaussian,
    as well as a noise amplitude which multiplies random normally distributed
    (sigma=1, center=0) noise that is added to the Gaussian"""
    y = amp * np.exp(-.5*((x-center)/sigma)**2)
    yerr = y + np.random.randn(len(x)) * noise_amp
    return yerr

# make up some hidden parameters for the noisy Gaussian; we will try to discover them with optimization later
npts, minx, maxx = 15, 0, 10
x = np.linspace(minx, maxx, npts)  # making linearly spaced x values between min and max with n points
amp, center, sigma = 50.1, 5.3, 1.11  # made-up hidden parameters
noise_amp = .05*amp  # this multiplies a unit normal distribution to simulate noise; larger means more noise

# get the noisy y values using the above input parameters
yerr = mygauss_errors(x, amp, center, sigma, noise_amp)

# for plotting purposes, also make a more "continuous" x variable, with much finer spacing
xcont = np.linspace(0, 10, 1000)
ycont = mygauss(xcont, amp, center, sigma)  # also get the true continuous y values

plt.figure(figsize=(8,4))
plt.plot(xcont, ycont, 'r', label='True values (nature)')
plt.errorbar(x, yerr, yerr=noise_amp, marker='o', linestyle='None', label='Noisy data')
plt.title('Fake data example with a noisy Gaussian')
plt.legend()

# # Now that we have some fake data, let's pretend we don't know the parameters and try to find them via $\chi^2$ minimization!

# In[110]:

# to fit a function to some data, you will need to minimize (optimize) some statistic;
# scipy has a function which will take in a model and data and find the best parameters
# for you using a chi-squared minimization
from scipy.optimize import curve_fit
# you can always google for more info about common functions, but python has documentation strings
# that you can access in the notebook by typing help(curve_fit);
# executing "curve_fit?" would also pop up a window with the same information

# In[134]:

# use the curve_fit function to find optimal parameters for the mygauss function, given the data
# (it does better with a good guess for starting parameters, though none or even bad guesses work)
ampguess, centerguess, sigmaguess = 42, 4.2, .42
params, cov = curve_fit(mygauss, x, yerr, p0=[ampguess, centerguess, sigmaguess])
print("These are the best fit amplitude, center, and sigma")
print(params)
print("These are the covariances between those parameters (i.e. diagonal elements are sigma**2 for each param.)")
print(cov)

# these will print out the values and their errors, according to the covariance matrix
print("True amplitude, center, and sigma: " + str(amp) + ", " + str(center) + ', ' + str(sigma))
print("Fitted amplitude, center, and sigma: " + str(params))
print("With estimated errors +/- " + str(np.sqrt(np.diag(cov))))  # errors are the square roots of the diagonal variances
# reduced chi-squared: sum of squared residuals over sigma**2, divided by the degrees of freedom (N data points - 3 parameters)
chi2 = 1./(len(x)-3)*np.sum((yerr-mygauss(x,*params))**2/(noise_amp)**2)
print('Reduced chi-squared of fit: ' + str(chi2))

# In[135]:

list(zip(params, np.sqrt(np.diag(cov))))  # pair each best-fit parameter with its error

# # Now plot all the curves together: the noisy data, the true nature, and the fit

# In[136]:

plt.figure(figsize=(10,6))
plt.errorbar(x, yerr, yerr=noise_amp, marker='o', linestyle='None', label='Noisy data')
plt.plot(xcont, mygauss(xcont, amp, center, sigma), label='True data (nature)')
fiterrs = np.sqrt(np.diag(cov))
plt.fill_between(xcont, mygauss(xcont, *(params-fiterrs)), mygauss(xcont, *(params+fiterrs)), color='orange', alpha=.5)
plt.plot(xcont, mygauss(xcont, *params), label='Fitted curve', color='r')
plt.axhline(0, color='k', linestyle='--')
plt.xlabel('X axis', fontsize=20)
plt.ylabel('Y axis', fontsize=20)
plt.title('Our result: A noisy Gaussian with fit', fontsize=20)
plt.legend(loc='upper left')

# ### time is important, and some of it is worth keeping track of

# In[140]:

import time
print(time.time())  # one way to keep track of time is to use the system time
get_ipython().run_line_magic('pinfo', 'time.time')
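# # Optional: curve_fit can also use the measurement uncertainties directly
# A minimal sketch (not in the original notebook): passing the known noise level
# through curve_fit's `sigma` and `absolute_sigma` arguments weights the fit and
# makes the covariance matrix directly interpretable as parameter variances.

# In[ ]:

params_w, cov_w = curve_fit(mygauss, x, yerr,
                            p0=[ampguess, centerguess, sigmaguess],
                            sigma=np.ones(len(x))*noise_amp,  # one uncertainty per data point
                            absolute_sigma=True)              # treat sigma as absolute, not relative
print("Weighted fit parameters:", params_w)
print("Weighted fit errors:", np.sqrt(np.diag(cov_w)))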
# ## Just for fun... we can repeat all the above, faking the data and fitting it as many times as we want. Does it return reasonable model fits? This is a way of checking the measurement/modeling process for bias. Hint: lower/raise the number of points (npts) and noise amplitude (noise_amp) variables below to see how e.g. the reduced chi-squared distribution changes

# In[147]:

npts, minx, maxx = 100, 0, 9
x = np.linspace(minx, maxx, npts)  # making linearly spaced x values between min and max with n points
amp, center, sigma = 50.1, 5.3, 1.11  # made-up hidden parameters
noise_amp = 2  # this multiplies a unit normal distribution to simulate noise; larger means more noise

# Now just repeat the experiment with all the above true parameters
# and store the resulting model parameters
nexpts = 10000  # the number of times to run the expt; 10k takes a few seconds
allchi2 = np.zeros(nexpts)
allamp, allcen, allsig = np.zeros(nexpts), np.zeros(nexpts), np.zeros(nexpts)
allampstd, allcenstd, allsigstd = np.zeros(nexpts), np.zeros(nexpts), np.zeros(nexpts)
tstart = time.time()  # mark the start time
for i in range(nexpts):  # simply loop over repeated "experiments", saving the model parameters each time
    yerr = mygauss_errors(x, amp, center, sigma, noise_amp)  # fake the data
    params, cov = curve_fit(mygauss, x, yerr, p0=[ampguess, centerguess, sigmaguess])  # fit the model
    allchi2[i] = 1./(len(x)-3)*np.sum((yerr-mygauss(x,*params))**2/(noise_amp)**2)  # record chi2
    allamp[i], allcen[i], allsig[i] = params  # save parameters
    allampstd[i], allcenstd[i], allsigstd[i] = np.sqrt(np.diag(cov))  # save the reported parameter errors
print("took " + str(time.time()-tstart) + " seconds")  # current time minus start time is total time to run

# ### there are lots of things we could now plot... for instance, is the reduced chi2 distribution reasonable?

# In[148]:

plt.hist(allchi2, bins=100, histtype='step')
plt.xlabel('$\\chi^2_r$', fontsize=20)
plt.title('The $\\chi^2$ of ' + str(nexpts) + ' repeated "measurement simulations"')
plt.axvline(1, color='k')

# ### looking at the distribution of model fits and residuals

# In[144]:

nplots = 50
thetruey = mygauss(xcont, amp, center, sigma)
plt.figure(figsize=(10,4))
for i in range(nplots):
    plt.subplot(121)
    thefity = mygauss(xcont, allamp[i], allcen[i], allsig[i])
    plt.plot(xcont, thefity, 'b.', alpha=.01)  # plot the best fit model for this expt
    plt.subplot(122)
    plt.plot(xcont, (thefity-thetruey)/amp*100., 'k.', alpha=.01)  # plot the residuals
plt.ylabel('% difference')
plt.title('residuals of best fit model minus truth')
plt.axhline(0, color='k', linestyle='-', label='residuals')
plt.xlabel('x')
plt.legend()
plt.subplot(121)
plt.plot(xcont, ycont, 'k', linewidth=2, label='true nature')  # plot the true nature being measured
plt.title('spread of best fit model \n in repeated experiments')
plt.legend()

# In[150]:

plt.figure(figsize=(12,3))
plt.suptitle('Reported parameter fitting "errors"')
plt.subplot(131)
plt.hist(allampstd)
plt.xlabel('amplitude')
plt.subplot(132)
plt.hist(allcenstd)
plt.xlabel('center')
plt.subplot(133)
plt.hist(allsigstd)
plt.xlabel('sigma')
# why are the reported errors for sigma and center the same??
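# # Optional extra bias check: "pull" distributions
# A minimal sketch (not in the original notebook): if the reported errors are honest,
# the pulls (fitted value minus true value, divided by the reported error) should
# follow a unit normal distribution, centered on 0 with a standard deviation of 1.

# In[ ]:

pulls = (allcen - center)/allcenstd  # pull of the fitted center in each experiment
plt.hist(pulls, bins=100, histtype='step')
plt.xlabel('(fitted center - true center) / reported error')
plt.title('Pulls for the Gaussian center')
print("pull mean:", np.mean(pulls), "pull std:", np.std(pulls))  # expect ~0 and ~1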
# In[117]:

# This was plotting Xiangdong's interesting lab data for the first class
xdat = np.recfromtxt('/home/andrew/Work/Data sorted by Voltage.txt', skip_header=2,
                     names=['n','pvol','mvol','foo','bar'])
plt.figure(figsize=(10,4))
plt.subplot(121)
plt.plot(xdat['n'], xdat['pvol'], label='+10 V', alpha=.7)
plt.plot(xdat['n'], xdat['mvol'], label='-10 V', alpha=.7)
plt.xlabel('Measurement #')
plt.ylabel('Voltage [$10^{-3}$ V]')
plt.legend()
plt.subplot(122)
plt.hist(xdat['pvol'], bins=100, histtype='step', label='+10 V')
plt.hist(xdat['mvol'], bins=100, histtype='step', label='-10 V')
pvmed, nvmed = np.median(xdat['pvol']), np.median(xdat['mvol'])
pvstd, nvstd = np.std(xdat['pvol'])/np.sqrt(len(xdat)), np.std(xdat['mvol'])/np.sqrt(len(xdat))
print("The median of the +10/-10 V voltage measurements is:", pvmed, nvmed)
print("The standard error is:", pvstd, nvstd)
plt.axvline(0, color='k')
plt.axvline(pvmed, color='b')
plt.axvline(nvmed, color='g')
plt.xlabel('millivolts')
plt.legend()
plt.suptitle("WELCOME TO $\\rho\\hbar\\psi s 122$", fontsize=30)
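# # As promised earlier, the np.loadtxt syntax for the same kind of file
# A minimal sketch (not in the original notebook), assuming the same whitespace-delimited
# file as the cell above; loadtxt returns plain arrays instead of named records.

# In[ ]:

n, pvol, mvol = np.loadtxt('/home/andrew/Work/Data sorted by Voltage.txt',
                           skiprows=2, usecols=(0,1,2), unpack=True)
print(n[:5], pvol[:5], mvol[:5])  # first few rows of each column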