#!/usr/bin/env python # coding: utf-8 # # SciPy and NumPy Cheat Sheet # The core libraries in Python are NumPy, SciPy, Matplotlib and Pandas. NumPy provides homogeneous multidimensional arrays, SciPy provides functions and operators for mathematical computation # Matplotlib can be used to plot functions and Pandas is used for non-homogeneous data manipulation and reading and writing files. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') # ## Python built-in data structures # Python offers the following built-in data structures: tutple, list, dictionary and set. We will discuss the array data structure in a following sextion about the NumPy numerical package. # ### Tuple # A tutple is an immutable collection of immutable items that can be of different types: int, float, bool, string. A tuple provides few operators such as count() and index() # In[2]: t = 1, 2.6, 'New York', 'New York' c = t.count('New York') i = t.index('New York') print("Number of occurrences: {0:d}\nIndex of 1st occurrence: {1:d}\n".format(c, i)) # ### List # A Python list is a mutable collection of mutable items that can be of different types. In other words, a list can be expanded and reduced and its items are mutable. The functions used to expand a list are: append(), extend(), insert(). # In[3]: alist = [1,2,5] print(alist) # prints the list print(alist[0]) # prints the 1st element of the list # We can add one item at a time to the end of the list using append() # In[4]: alist.append(7) print(alist) # We can use extend() to add more items # In[5]: alist.extend([8, 3, 2]) print(alist) # Finally, we can add an element at a certain index in the list # In[6]: alist.insert(1, 'New York') print(alist) # We can remove the 1st occurrence of an item that contains a particular value from the list # In[7]: alist.remove('New York') print(alist) # We can also return an item with a certain index value and remove it from the list # In[8]: v = alist.pop(3) v # In[9]: print(alist) # We can slice a tuple or a list to select subsets of those collections # In[10]: alist[2:4] # We can look for an item in a list of strings # In[18]: cities = ['Rome', 'Paris', 'London', 'Berlin', 'Madrid', 'Athens'] # In[19]: def find_item(list, item): if (item in list): return 1 else: return 0 # In[20]: find_item(cities, 'Madrid') # ### Multi-dimensional list # In[11]: alist2d = [[0.1, 2], [3, 4]] print(alist2d) alist2d[0][0] # ### List comprehension # List comprehension can be used to apply a function to a list of objects, as with a for loop, like a map() function or a lambda function. # In[16]: families = ["Pippo/Pluto", "Qui/Quo/Qua"] # Let's define a function that splits the tokens in a string and extracts the last element # In[17]: def leaf(family): length = len(family.split("/")) leaf = family.split("/")[length - 1] return leaf # Now we apply the function to a list of strings using a list comprehension # In[18]: [leaf(family) for family in families] # We can achieve the same result with the following line, kind of lambda function # In[19]: [family.split("/")[len(family.split("/")) -1] for family in families ] # We can easily create nested list comprehension that work similarly to nested for loops # In[20]: [[(i,j) for j in range(1,5)] for i in range(1,5)] # ### Functional programming # We can get the same result by applying the leaf() function to each object in the families list using the map() function. # In[70]: list(map(leaf, families)) # ### Dictionary # A dictionary is a symbol table: a collection of objects in which each object is associated to a key. The collection can be expanded and reduced. # In[41]: cartoon_characters = {"Pippo": 5, "Pluto": 4, "Topolino": 2} # In[42]: [print("Name: " + name + ", score: " + str(score)) for name, score in cartoon_characters.items()] # We can print the objects' keys # In[81]: list(cartoon_characters.keys()) # and the values # In[83]: list(cartoon_characters.values()) # ## NumPy # A list can be used as an array: a collection of mutable objects of the same type: chars, integers or float or bool. Let's import the NumPy library # In[1]: import numpy as np # A NumPy array can be created from a list # In[2]: l = [1, 2, 4] array = np.array(l) print(array) # prints the full array type(array) # or from a range of number with start, end and step # In[3]: arange = np.arange(0, 9, 1) # an array from 0 to 9 with increment 1 print(arange) # A Python array is an object, an instance of a class, that contains a list of data of the same type and offers many functions to transform the data. We can write the data into a file # In[4]: path = 'physics/data/myarray.txt' with open(path, 'wb') as f: arange.tofile(f) # Then we can read the file and put the data into a new array # In[5]: with open(path, 'rb') as f: c = np.fromfile(f, dtype='int') print(c) # We can compute the sum, the mean, and the standard deviation of the elements in the array # In[6]: s = c.sum() m = c.mean() e = c.std() print("Sum: {0:.1f}\nMean: {1:.1f}\nStandard deviation: {2:.3f}".format(s, m, e)) # ### Bi-dimensional array # # In[7]: array2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) print(array2d) # prints the 2D array # Let's print the element in the 3rd row and 2nd column # In[8]: print(array2d[2, 1]) # An n-dimensional array is used to represent a transformation into an n-dimesional space. # In[9]: d = array2d.ndim # dimension of the array print("Array dimension: {0:d}".format(d)) # Each dimension consists of a tuple of numbers that represents its shape. # In[10]: array2d.shape # shape of the multidimensional array # We can compute the sum of the values of the array along one dimension or axix, for example along the columns # In[11]: array2d.sum(axis=0) # or along the rows # In[12]: array2d.sum(axis=1) # We may want to initialize an array with zeros # In[13]: zeros2d = np.zeros((3,3)) print(zeros2d) # or with ones # In[14]: ones2d = np.ones((3, 3)) print(ones2d) # or we might need an identity matrix, a square matrix whose diagonal values are ones and all the rest zeros # In[15]: np.eye(3) # Sometimes we need a sample of real numbers from an interval that are equally spaced. The linspace() function can be used to create such samples. It takes as input the two endpoints of the interval and the number of sample data points equally spaced within that interval. It is similar to range() but instead of setting the step we set the number of data points in the sample. # In[16]: x = np.linspace(0., 1., 11) x # ### Reshaping # The data points to be represented in a matrix, or in another multi-dimesional array, may come from a sequence so that once they are in a unidimensional array we have to change its shape adding the missing dimensions and putting the data points in the right place. After a reshaping operation The number of elements will stay the same but the shape of the array will be different. # In[17]: l = 1, 2, 3, 4, 5, 6, 7, 8, 9 m1 = np.array(l) m2 = m1.reshape(3,3) # this is the same array but represented as a 3x3 matrix print(m2) # We can also flatten the data back to its original shape # In[18]: m2.flatten() # As said at the beginning of this section an array contains mutable object, that is, each object can change value (not type) # In[19]: m2[1, 1] = 0 print(m2) # ### Resizing # We may need to create a new array, for example the column vectors from a matrix, by resizing an array. After the risizing the shape will be different but the dimensions will stay the same so to create a column vector we have to create a nwe array and copy the values of the resized (or of the original matrix) into it. # In[20]: v_shape = 3 v1 = np.resize(m2, (v_shape, 1)) v1 v2 = np.zeros(3) for i in range(0, 3): v2[i] = v1[i]; v2 # ### Boolean arrays # We can apply logical operators to arrays as well as to numbers. We apply a logic operator to a random matrix, a matrix whose elements are a sample of pseudo-random numbers from a uniform distribution in the interval [0, 1) # In[21]: R = np.random.rand(3, 3) A = R > 0.5 A # Instead of representing the logical values with True or False we can represent them with the integers 1 and o respectively # In[22]: A.astype(int) # We can filter the elements of an array according to the logical rule defined before # In[23]: R[A] # We can apply different rules on the array elements depending on the result of the logical operator # In[24]: np.where(R > 0.5, R - 0.5, R + 0.5) > 0.5 # In[25]: I = 5000 get_ipython().run_line_magic('time', 'mat = np.random.standard_normal((I, I))') # ### Concatenation # Let's say we have four 2x2 matrices A, B, C and D and we want to concatenate them in one 4x4 matrix M # # $$ M = \begin{bmatrix} A & B \\ C & D \end{bmatrix} $$ # In[26]: A = np.array([[1, 2], [3, 4]]) B = np.array([[5, 6], [7, 8]]) C = np.zeros([2, 2]) D = np.ones([2, 2]) A, B, C, D # In[27]: M_up = np.concatenate([A, B], axis=1) M_down = np.concatenate([C, D], axis=1) M = np.concatenate([M_up, M_down], axis=0) M # ### Vectorized operators # An important feature of Python arrays is that mathematical operators on arrays do not need to use loops, an operator is applied to all members of an operand array # In[28]: x = np.arange(0., 9., 1) y = 3 * x + 5 y # vectorization is even more useful for multi-dimensional arrays # In[29]: A = np.random.randint(0, 10, (3, 3)) A # In[30]: B = np.random.randint(0, 10, (3, 3)) B # We can compute the sum of two matrices with one single statement without loops. The operator is apĆ¼plied to an element of the first matrix and the corresponding element in the second matrix # In[31]: A + B # ### Broadcasting # Broadcasting allows to use an operator with operands of different shapes, for example a matrix and a scalar # In[32]: B + 3 # In[33]: 2 * B # Two matrices A and B can be used as operand of an operator O if the matrix with the smaller shape can be broadcasted over the bigger one by moving it along one index or both. # In[34]: A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) A # In[35]: B = np.array((1, 2, 3)) B # Clearly we can move B over A one row at a time. # In[36]: A * B # Broadcasting doesn't change the associative rule of the operator. # In[37]: B * A # ### Matrix multiplication # In[38]: C = np.array([[1, -2], [5, -9]]) C # In[39]: D = np.array([[-9, 2], [-5, 1]]) D # In[40]: C @ D # Since D is the inverse of C, that is $D = C^{-1}$ then CD = DC = I, where I is the identity matrix. # In[41]: D @ C # We can apply an operator C to a vector x in two ways: directly or through their transpose $\hat{C}\vec{x} = \vec{x}^T\hat{C}^T$ # In[45]: x = np.array([1, 1]) C @ x # In[43]: x.T @ C.T # ### Matrix multiplication for collaborative filtering # A case is the computation of item similarities. Given an interaction matrix A where the rows are users and the columns represent # items, a 1 represent an interaction between a user and an item. A similarity measure between items can be computed from the interaction matrix A as shown below. # In[122]: A = np.array([[1,1,0,1], [1,1,1,0], [1,0,0,1]]) # 3 users and 4 items AT = A.transpose() S = AT.dot(A) # S is symmetric S # We can also consider the ratings of the users on the items, e.g. on a scale from 1 to 5 (0 means not rated) # In[123]: A = np.array([[5,3,0,2], [4,0,3,1], [3,0,0,1]]) AT = A.transpose() S = AT.dot(A) S # We can now recommend items to a user by multiplying the user's ratings by the similarity matrix. The output is a vector of values that represent how much the user should rate each item. We will have to remove from the list the items that have been already rated by the user in order to recommend previously unseen items. # In[124]: u1 = A[0] # user 1 interactions r1 = S.dot(u1) print(r1) # In[125]: u1_nr = (u1 == 0) # find items not rated by user 1 item_i = np.where(u1_nr) # find the index of the 1st not rated item print(r1.dot(u1_nr), item_i) # In[126]: u2 = A[1] r2 = S.dot(u2) print(r2) # In[127]: u2_nr = (u2 == 0) item_i = np.where(u2_nr) print(r2.dot(u2), item_i) # In[128]: u3 = A[2] S.dot(u3) # ## Record array # A record array can contain objects of different types in different colums, e.g. all integers in a column and all strings in another column # In[23]: recarray = np.zeros((2,), dtype=[('Integers','i4'),('Float','f4'),('Strings','a10')]) # a record array of two empty records (dtype is optional) print(recarray) # In[35]: # prepare three arrays each of the same type: integer, float, string column1 = np.array((1,2), dtype='int') column2 = np.array((1.5,2.5), dtype='float') column3 = np.array(('Rome','Paris'), dtype='str') recarray = [column1, column2, column3] # puts together the three arrays in a record array print(recarray[:]) print(recarray[2]) # In[36]: # another way of creating a record array recarray2 = np.rec.array([(1,1.5,'New York'),(2,3.7,'Moscow')],dtype=[('Integers','i4'),('Float','f4'),('Strings','a10')]) print(recarray2.Strings) # prints all the values in column 'Strings' # ## Pandas # # ### Read & Write # Pandas works much better than NumPy in reading and writing files # In[40]: import pandas as pd path = 'covid19/data/tui-dat.csv' df = pd.read_csv(path) df[0:2] # ## Linear Algebra # In[41]: # Solve the linear system AX = B A = np.array([[3, 6, -5], [1, -3, 2], [5, -1, 4]]) B = np.array([[12], [-2], [10]]) Ainv = np.linalg.inv(A) # inverse of A X = Ainv.dot(B) # X = A^(-1)B print(X) print(A.dot(X)) # returns B # In[42]: # Determinant of a squared matrix # 1 2 3 # S = 0 2 1 # 0 0 -1 S = np.array([[1,2,3], [0,2,1], [0,0,-1]]) det = np.linalg.det(S) print(det) l = np.linalg.eigvals(S) # eigenvalues print(l) # ## Histograms # In[50]: import matplotlib.pyplot as plt mu, sigma = 0.0, 1.0 # mean value and standard deviation sample_size = 10000 v = np.random.normal(mu,sigma, sample_size) # Normal distribution fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5)) n = ax.hist(v, bins=100) # n contains the number of samples in each bin, the one-dimensional grid values and the plot object # # SciPy # Let's import the SciPy library and the Matplotlib library for visualization # In[4]: from scipy.optimize import curve_fit # ## Linear fitting # In[5]: # Creating a linear function to model and create data def linearfunc(x, a, b): return a * x + b x = np.linspace(0, 10, 100) # start = 0, stop = 0, samples = 100 y = linearfunc(x, 1, 2) # linear function defined in [0,10] plt.figure() plt.plot(x, y) # Adding noise to the data yn = y + 0.9 * np.random.normal(size=len(x)) plt.plot(x,yn) # Executing curve_fit on noisy data popt, pcov = curve_fit(linearfunc, x, yn) # estimates the parameters of the linear function a, b print(popt) yfit = linearfunc(x,popt[0],popt[1]) # the fitted linear function overlaps with the original one plt.plot(x,yfit) # ## Gaussian fitting # In[6]: # Creating a Gaussian function to model and create data def gaussfunc(x, a, b, c): return a*np.exp(-(x-b)**2/(2*c**2)) # Generating clean data x = np.linspace(0, 10, 100) y = gaussfunc(x, 1, 5, 2) # Adding noise (gaussian) to the data (also gaussian) yn = y + 0.2 * np.random.normal(size=len(x)) plt.figure() plt.plot(x,yn) # plot the gaussian function with random noise - red color # Executing curve_fit on noisy data popt, pcov = curve_fit(gaussfunc, x, yn) # estimates the parameters of the gaussian function a, b, c print(popt) yfit = gaussfunc(x,popt[0],popt[1],popt[2]) # plot the fitted gaussian plt.plot(x,yfit) # ## Solve equations # In[7]: from scipy.optimize import fsolve curve = lambda x: (x - 1)*(x - 2) solution1 = fsolve(curve, 0) print(solution1) solution2 = fsolve(curve,3) print(solution2) # ## Univariate interpolation # In[8]: from scipy.interpolate import interp1d x = np.linspace(0, 3*np.pi, 10) y = np.sin(x) # create a linear interpolation function linearfunc = interp1d(x, y, kind='linear') # create a quadratic interpolation function quadraticfunc = interp1d(x, y, kind='quadratic') # interpolate on a grid of 1,000 points x_interp = np.linspace(0, 3*np.pi, 100) linear_interp = linearfunc(x_interp) quadratic_interp = quadraticfunc(x_interp) # plot the results plt.figure() # new figure plt.plot(x, y,'o') # plot the data points plt.plot(x_interp, linear_interp, x_interp, quadratic_interp); # plot the linear and quadratic interpolations plt.legend(['data', 'linear', 'quadratic'], loc='best') # In[9]: # Interpolation of noisy data from scipy.interpolate import UnivariateSpline sample = 30 x = np.linspace(0.5, 10*np.pi, sample) y = np.cos(x) + np.log10(x) + np.random.randn(sample) / 10 linearfunc = interp1d(x, y, kind='linear') splinefunc = UnivariateSpline(x, y, s=1) x_interp = np.linspace(0.5, 10*np.pi, 1000) linear_interp = linearfunc(x_interp) spline_interp = splinefunc(x_interp) plt.figure() plt.plot(x,y,'o') plt.plot(x_interp, linear_interp, x_interp, spline_interp) plt.legend(['data', 'linear', 'spline'], loc='best') # ## Multivariate Interpolation # In[2]: def func(x, y): return np.sqrt(x**2 + y**2)+np.sin(x**2 + y**2) # In[3]: # creates a 2D grid of 1000x1000 points with coordinates values from 0 to 5 for both x and y grid_x, grid_y = np.mgrid[0:5:100j, 0:5:100j] # sample data points xy = np.random.rand(1000, 2) z = func(xy[:,0]*5, xy[:,1]*5) # In[12]: from scipy.interpolate import griddata # interpolating data grid_z0 = griddata(xy*5, z, (grid_x, grid_y), method='cubic') # In[13]: plt.subplot(121) plt.imshow(func(grid_x, grid_y).T, extent=(0,1,0,1), origin='lower') # shows the image generated on the grid points plt.plot(xy[:,0], xy[:,1], 'k.', ms=1) # print the ramdom sample points plt.title('Original') plt.subplot(122) plt.imshow(grid_z0.T, extent=(0,1,0,1), origin='lower') # shows the interpolated image plt.title('Interpolated') # ## 3D Plotting # In[72]: def f(x, y): return np.exp(-(x*x + y*y) / 1.0) #return 1 / (1 + np.exp(-5*x - 4*y)) # In[73]: x = np.linspace(-1, 1, 100) y = np.linspace(-1, 1, 100) X, Y = np.meshgrid(x, y) # generates a grid from one-dimensional arrays z = f(X, Y) # In[74]: fig = plt.figure() ax = plt.axes(projection='3d') ax.contour3D(X, Y, z, 100, cmap='binary') # In[ ]: