#!/usr/bin/env python
# coding: utf-8

# # Week 05 - Numpy II and Scipy
#
# ## Today's Agenda
# - Numpy II
# - Scipy

# # Numpy II
#
# Last time, in [Numpy I and Matplotlib](https://github.com/VandyAstroML/Vanderbilt_Computational_Bootcamp/blob/master/notebooks/Week_05/05_Numpy_Matplotlib.ipynb), we covered `Numpy` and `Matplotlib`. This time we will focus on more advanced features of `Numpy`.

# In[1]:

# Loading modules
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt

# ## Review
# As a review, let's explore some of the concepts that were introduced last time, in `Numpy I`.
#
# ### Create 1D-arrays
# We introduced how to create a 1D-array:

# In[2]:

x = np.array([1, 2, 3, 5, 6, 7, 8, 10], dtype=float)
x

# In[3]:

y = np.arange(10)
y

# In[4]:

z = np.linspace(0, 100, 50)
z

# In[5]:

h = np.random.randn(100)
h

# ### Handling arrays
#
# These are just a few of the different ways to __create__ numpy arrays.
#
# You can also use functions like __np.max()__ and __np.min()__ to get the maximum and minimum values, respectively.

# In[6]:

print('Min X: {0:.3f} \t Max X: {1:.3f}'.format(np.min(x), np.max(x)))

# #### Apply mathematical functions

# In[7]:

zz = x**2 + 3*x**3
zz

# #### Conditionals
# Find the indices of the elements in an array that meet some criteria.
# In this example, we find all the elements of the array ___"```zz```"___ whose values lie between 100 and 500.

# In[8]:

zz_idx = np.where((zz >= 100) & (zz <= 500))[0]
print('zz_idx: {0}'.format(zz_idx))
zz[zz_idx]

# ## Manipulating Arrays
# There are a lot of things we can do to a _numpy array_.

# In[9]:

h1 = np.random.randint(10, 50, 50)
h1

# We can get the overall __size__ and __shape__ of the array.
#
# We can use the functions `numpy.size` and `numpy.shape` to get the total number of elements in an array and the shape of the array, respectively.

# In[10]:

np.size(h1)

# In[11]:

h1.shape

# In[12]:

A = np.array([[1, 2, 3, 4, 5],
              [6, 7, 8, 9, 10],
              [12, 13, 14, 16, 17],
              [13, 45, 67, 89, 90]])
A

# In[13]:

np.shape(A)

# You can also __transpose__ array `A`.

# In[14]:

A_t = np.transpose(A)
A_t

# Why are `Numpy` arrays better than lists?
# - Python lists are very _general_.
# - Lists __do not__ support matrix and dot multiplication, etc.
# - `Numpy` arrays are memory _efficient_.
# - Numpy arrays are __statically typed__ and __homogeneous__.
# - They are fast at mathematical functions.
# - They can be passed directly to code written in compiled languages, e.g. _C_ and _Fortran_.

# ### Array-generating functions
# For large arrays it is impractical to initialize the data manually using normal Python lists. Instead, we can use one of the many Numpy functions that generate arrays of different forms.

# #### numpy.arange
# We use this one to create a sequence of evenly spaced elements:

# In[15]:

np.arange(0, 10, 1)

# In[16]:

np.arange(0, 20, 5)

# In[17]:

np.arange(-40, 21, 10)

# #### linspace and logspace
# We use these functions to create ordered arrays, separated by even intervals in _real_- and _log_-space.

# In[18]:

B = np.linspace(0, 50)
B

# In[19]:

B = np.linspace(0, 100, 20)
B

# Array of __25 elements__ from $10^{0}$ to $10^{3}$, with __base of 10__:

# In[20]:

B = np.logspace(0, 3, 25)
B

# Creating an array of 11 elements from $e^{0}$ to $e^{10}$, with ```base == numpy.e```:

# In[21]:

B = np.logspace(0, 10, 11, base=np.e)
B

# #### Random Data

# In[22]:

from numpy import random

# In[23]:

# Uniform random numbers in [0, 1)
random.rand(5, 5)

# In[24]:

# 20 random integers from 10 to 29 (the upper bound is exclusive)
random.randint(10, 30, 20)
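# Since these examples draw random numbers, your outputs will differ from run to run. If you want reproducible results, you can fix the seed of the random-number generator first. A minimal sketch (the seed value `42` is arbitrary):

# In[ ]:

# Fixing the seed makes the "random" draws reproducible across runs
np.random.seed(42)
print(np.random.rand(3))

# Re-seeding with the same value gives an identical draw
np.random.seed(42)
print(np.random.rand(3))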
# #### Arrays of `zeros` and `ones`

# In[25]:

np.zeros(20)

# You can use these to populate other arrays.

# In[26]:

nelem = 10
C = np.ones(nelem)
C

# In[27]:

for ii in range(C.size):
    C[ii] = random.rand()
C

# ### Diagonals
# You can also construct an array with another array as the diagonal.

# In[28]:

np.diag(random.randint(10, 20, 5))

# ### Indexing
# You can choose which values to select.
# Normally, you select the `rows` first, and then the `cols` of a `numpy.ndarray`.

# In[29]:

M = random.rand(10, 5)
M

# Selecting the row at index 1, i.e. the 2nd row (indices start at 0):

# In[30]:

M[1,:]

# The column at index 1, i.e. the 2nd column:

# In[31]:

M[:,1]

# Select a range of columns and rows:

# In[32]:

M[1:3, 2:4]

# You can easily use this to create a __mask__, for when you are _cleaning_ your data.

# In[33]:

A = random.rand(3, 3)
np.fill_diagonal(A, np.nan)
A

# In[34]:

B = np.arange(0, 9).reshape((3, 3))
B

# Applying the __mask__ from $A \to B$:

# In[35]:

A_mask = np.isfinite(A)
A_mask

# In[36]:

B[A_mask]

# ### Binning your data
# This is probably one of the most useful features of `Numpy`.
# You can use it to bin your data and calculate __means__, __standard deviations__, etc.

# #### numpy.digitize

# In[37]:

# Creating my bin edges
bins = np.arange(0, 13)
bins

# In[38]:

# Generating data
data = 10 * random.rand(100)
data

# Now I want to bin my data and calculate the mean for each bin.

# In[39]:

# Defining statistical function to use
stat_func = np.nanmean

# Binning the data
data_bins = np.digitize(data, bins)
data_bins

# Calculating the __mean__ for each of the bins:

# In[40]:

failval = -10
bins_stat = np.array([stat_func(data[data_bins == ii])
                      if len(data[data_bins == ii]) > 0
                      else failval
                      for ii in range(1, len(bins))])
bins_stat

# You can put all of this into a function that estimates errors and more...

# In[41]:

import math

def myceil(x, base=10):
    """
    Returns the upper-bound integer of 'x' in base 'base'.

    Parameters
    ----------
    x: float
        number to be approximated to the closest number to 'base'

    base: float
        base used to calculate the closest 'largest' number

    Returns
    -------
    n_high: float
        Closest float number to 'x', i.e. upper-bound float.

    Example
    -------
    >>> myceil(12, 10)
    20.0
    >>> myceil(12.05, 0.1)
    12.1
    """
    n_high = float(base * math.ceil(float(x) / base))
    return n_high

def myfloor(x, base=10):
    """
    Returns the lower-bound integer of 'x' in base 'base'.

    Parameters
    ----------
    x: float
        number to be approximated to the closest number of 'base'

    base: float
        base used to calculate the closest 'smallest' number

    Returns
    -------
    n_low: float
        Closest float number to 'x', i.e. lower-bound float.

    Example
    -------
    >>> myfloor(12, 5)
    10.0
    """
    n_low = float(base * math.floor(float(x) / base))
    return n_low

def Bins_array_create(arr, base=10):
    """
    Generates an array of bin edges between [arr.min(), arr.max()]
    in steps of `base`.

    Parameters
    ----------
    arr: array_like, Shape (N,...), One-dimensional
        Array of numerical elements

    base: float, optional (default=10)
        Interval between bins

    Returns
    -------
    bins_arr: array_like
        Array of bin edges for given arr
    """
    base = float(base)
    arr = np.array(arr)
    assert(arr.ndim == 1)
    arr_min = myfloor(arr.min(), base=base)
    arr_max = myceil(arr.max(), base=base)
    bins_arr = np.arange(arr_min, arr_max + 0.5 * base, base)
    return bins_arr
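# As a quick sanity check of these helper functions, here is what they return for a few simple inputs (expected outputs shown as comments):

# In[ ]:

# Sanity-checking the helper functions defined above
print(myceil(12, 10))                        # 20.0 -> next multiple of 10 above 12
print(myfloor(12, 5))                        # 10.0 -> previous multiple of 5 below 12
print(Bins_array_create([3, 27], base=10))   # [ 0. 10. 20. 30.]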
# In[42]:

def Mean_std_calc_one_array(x1, y1, arr_len=0, statfunc=np.nanmean,
                            failval=np.nan, error='std', base=10.):
    """
    Calculates statistics of two arrays, e.g. scatter, error in `statfunc`, etc.

    Parameters
    ----------
    x1: array-like, shape (N,)
        array of x-values

    y1: array-like, shape (N,)
        array of y-values

    arr_len: int, optional (default = 0)
        minimum number of elements in a bin

    statfunc: numpy function, optional (default = numpy.nanmean)
        statistical function used to evaluate the bins

    failval: int or float, optional (default = numpy.nan)
        value used when the number of elements in the bin
        is smaller than or equal to `arr_len`

    error: string, optional (default = 'std')
        type of error to evaluate
        Options:
        - 'std': Evaluates the standard deviation of each bin
        - 'stat': Evaluates the error in the mean/median of each bin
        - 'none': Does not calculate the error in `y1`

    base: float
        Value of bin width in units of that of `x1`

    Returns
    -------
    x1_stat: array-like
        `statfunc` of each bin in `base` spacings for x1

    y1_stat: array-like
        `statfunc` of each bin in `base` spacings for y1

    y1_err: array-like
        error in `y1_stat` for each bin (zeros if error == 'none')
    """
    x1 = np.asarray(x1)
    y1 = np.asarray(y1)
    assert((x1.ndim == 1) & (y1.ndim == 1))
    assert((x1.size > 0) & (y1.size > 0))
    ## Computing bins
    x1_bins = Bins_array_create(x1, base=base)
    x1_digit = np.digitize(x1, x1_bins)
    ## Computing statistics in bins
    x1_stat = np.array([statfunc(x1[x1_digit == ii])
                        if len(x1[x1_digit == ii]) > arr_len else failval
                        for ii in range(1, x1_bins.size)])
    y1_stat = np.array([statfunc(y1[x1_digit == ii])
                        if len(y1[x1_digit == ii]) > arr_len else failval
                        for ii in range(1, x1_bins.size)])
    ## Computing error in the data
    if error == 'std':
        # Standard deviation of each bin
        stat_err = np.nanstd
        y1_err = np.array([stat_err(y1[x1_digit == ii])
                           if len(y1[x1_digit == ii]) > arr_len else failval
                           for ii in range(1, x1_bins.size)])
    elif error == 'stat':
        # Error in the mean/median of each bin
        stat_err = np.nanstd
        y1_err = np.array([stat_err(y1[x1_digit == ii]) / np.sqrt(len(y1[x1_digit == ii]))
                           if len(y1[x1_digit == ii]) > arr_len else failval
                           for ii in range(1, x1_bins.size)])
        if (statfunc == np.median) or (statfunc == np.nanmedian):
            # The error in the median is ~1.253 times the error in the mean
            y1_err *= 1.253
    else:
        y1_err = np.zeros(y1_stat.size)
    return x1_stat, y1_stat, y1_err

# __Example of using these functions__:

# In[43]:

# Defining arrays
x_arr = np.arange(100)
y_arr = 50 * np.random.randn(x_arr.size)

# Computing the mean/median and their errors for `x_arr` and `y_arr`
x_stat, y_stat, y_err = Mean_std_calc_one_array(x_arr, y_arr,
                                                statfunc=np.nanmean,
                                                failval=np.nan,
                                                base=10)
x_stat2, y_stat2, y_err2 = Mean_std_calc_one_array(x_arr, y_arr,
                                                   statfunc=np.nanmedian,
                                                   failval=np.nan,
                                                   base=10)

# In[44]:

plt.style.use('seaborn-notebook')
plt.clf()
plt.close()
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, facecolor='white')
ax.plot(x_arr, y_arr, 'ro', label='Data')
ax.errorbar(x_stat, y_stat, yerr=y_err, color='blue', marker='o',
            linestyle='--', label='Mean')
ax.errorbar(x_stat2, y_stat2, yerr=y_err2, color='green', marker='o',
            linestyle='--', label='Median')
ax.set_xlabel('X axis', fontsize=20)
ax.set_ylabel('Y axis', fontsize=20)
ax.set_title('Data and the Binned Data', fontsize=24)
plt.legend(fontsize=20)
plt.show()

# With this function, it is really easy to compute statistics on binned data, as well as to ___estimate errors___ on the data.
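# As an aside, `SciPy` (introduced below) ships a convenience function, `scipy.stats.binned_statistic`, that performs much of this binning logic in one call. A minimal sketch, assuming the same `x_arr` and `y_arr` defined above:

# In[ ]:

from scipy import stats

# Mean of `y_arr` in bins of width 10 along `x_arr`
bin_edges = np.arange(0, 110, 10)
y_mean, edges, _ = stats.binned_statistic(x_arr, y_arr, statistic='mean',
                                          bins=bin_edges)

# Standard deviation of `y_arr` in the same bins
y_std, _, _ = stats.binned_statistic(x_arr, y_arr, statistic='std',
                                     bins=bin_edges)

bin_centers = 0.5 * (edges[1:] + edges[:-1])
print(bin_centers)
print(y_mean)
print(y_std)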
# ### Reshaping, resizing and stacking arrays
# One can always modify the shape of a `numpy.ndarray`, as well as append it to a pre-existing array.

# In[45]:

A = np.array([[n + m * 10 for n in range(5)] for m in range(5)])
A

# In[46]:

n, m = A.shape

# In[47]:

B = A.reshape((1, n * m))
B

# In[48]:

A_f = A.flatten()
A_f

# In[49]:

C = random.rand(A.size)
C

# In[50]:

C.shape

# In[51]:

# Stacking the two arrays as columns
D = np.column_stack((A_f, C))
D

# In[52]:

# Selecting the 3rd through 10th rows
D[2:10]

# #### np.concatenate
# You can also concatenate different arrays:

# In[53]:

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])

# In[54]:

np.concatenate((a, b))

# In[55]:

np.concatenate((a, b.T), axis=1)

# ### Copy and "Deep Copy"
# Sometimes it is important to create new _copies_ of arrays and other objects. For this reason, one uses __numpy.copy__ to create _new_ copies of arrays.

# In[56]:

A = np.array([[1, 2], [3, 4]])
A

# In[57]:

# `B` is now referring to the same array data as `A`
B = A

# If we make any changes to `B`, __`A` will also be affected by this change__.

# In[58]:

B[0,0] = 10
B

# In[59]:

A

# To get a __completely independent, new object__, you would use:

# In[60]:

B = np.copy(A)

# Modifying `B`
B[0,0] = -5
B

# In[61]:

A

# The array `A` was not affected by this change. This is important when you're constantly re-defining new arrays.

# # Scipy - Library of Scientific Algorithms for Python
# `SciPy` provides a large number of higher-level scientific algorithms.
# It includes:
# - Special Functions
# - Integration
# - Optimization
# - Interpolation
# - Fourier Transforms
# - Signal Processing
# - Linear Algebra
# - Statistics
# - Multi-dimensional image processing

# In[62]:

import scipy as sc

# ## Interpolation
# You can use `Scipy` to interpolate your data, e.g. with the `interp1d` function.

# In[63]:

from scipy.interpolate import interp1d

# In[64]:

def f(x):
    return np.sin(x)

# In[65]:

n = np.arange(0, 10)
x = np.linspace(0, 9, 100)
y_meas = f(n) + 0.1 * np.random.randn(len(n))  # simulate measurement with noise
y_real = f(x)

linear_interpolation = interp1d(n, y_meas)
y_interp1 = linear_interpolation(x)

cubic_interpolation = interp1d(n, y_meas, kind='cubic')
y_interp2 = cubic_interpolation(x)

# In[66]:

fig, ax = plt.subplots(figsize=(15, 6))
ax.set_facecolor('white')
ax.plot(n, y_meas, 'bs', label='noisy data')
ax.plot(x, y_real, 'k', lw=2, label='true function')
ax.plot(x, y_interp1, 'r', label='linear interp')
ax.plot(x, y_interp2, 'g', label='cubic interp')
ax.legend(loc=3, prop={'size': 20})
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=15)

# ### KD-Trees
# You can also use `SciPy` to construct __KD-Trees__ for a set of points.

# In[67]:

Lbox = 250.
Npts = 1000

# Creating cartesian coordinates
x = np.random.uniform(0, Lbox, Npts)
y = np.random.uniform(0, Lbox, Npts)
z = np.random.uniform(0, Lbox, Npts)
sample1 = np.vstack([x, y, z]).T
sample1

# In[68]:

sample1.shape

# Let's say we want to know how many pairs of points are separated by distances between 30 and 50. To answer this, you construct a __KD-Tree__.

# In[69]:

from scipy.spatial import cKDTree

# In[70]:

# Initializing the KD-Tree
KD_obj = cKDTree(sample1)

# In[71]:

N_neighbours = KD_obj.count_neighbors(KD_obj, 50) - \
               KD_obj.count_neighbors(KD_obj, 30)
print("Number of Neighbours: {0}".format(N_neighbours))

# Let's say you want to get the distances to the __Nth-nearest__ neighbour of each point.

# In[72]:

k_nearest = 4
dist_k, dist_k_idx = KD_obj.query(sample1, k_nearest)
dist_k

# You can also get the indices:

# In[73]:

dist_k_idx

# The first column corresponds to the point _itself_, since each point is its own nearest neighbour at zero distance.

# You can also find all pairs that are separated by at most a distance _r_:

# In[74]:

pairs = KD_obj.query_ball_tree(KD_obj, 30)
pairs[0:10]
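# Each entry of `pairs` lists the neighbours of the corresponding point, including the point itself. As a small sketch, here is one way to count the number of _distinct_ pairs within `r = 30`; `cKDTree.query_pairs` returns the same information directly:

# In[ ]:

# Counting distinct pairs from the `query_ball_tree` output:
# each pair (i, j) appears twice, plus the Npts self-matches
n_pairs = (sum(len(p) for p in pairs) - Npts) // 2
print("Distinct pairs within r = 30: {0}".format(n_pairs))

# `query_pairs` returns the distinct (i, j) pairs with i < j directly
print(len(KD_obj.query_pairs(30)))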
# __So that's it for today's lesson__
#
# Next week:
# - Advanced Visualization
# - Seaborn