#!/usr/bin/env python
# coding: utf-8

# # Week 05 - Numpy II and Scipy
#
# ## Today's Agenda
# - Numpy II
# - Scipy

# # Numpy II
#
# Last time, in [Numpy I and Matplotlib](https://github.com/VandyAstroML/Vanderbilt_Computational_Bootcamp/blob/master/notebooks/Week_05/05_Numpy_Matplotlib.ipynb), we covered `Numpy` and `Matplotlib`. This time we will focus on more advanced features of `Numpy`.

# In[1]:

# Loading modules
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt

# ## Review
# As a review, let's explore some of the concepts that were introduced last time, in `Numpy I`.
#
# ### Create 1D-arrays
# We introduced how to create a 1D-array:

# In[2]:

x = np.array([1, 2, 3, 5, 6, 7, 8, 10], dtype=float)
x

# In[3]:

y = np.arange(10)
y

# In[4]:

z = np.linspace(0, 100, 50)
z

# In[5]:

h = np.random.randn(100)
h

# ### Handling arrays
#
# These are just a few of the different ways to __create__ numpy arrays.
#
# You can also use functions like __np.max()__ and __np.min()__ to get the maximum and minimum values, respectively.

# In[6]:

print('Min X: {0:.3f} \t Max X: {1:.3f}'.format(np.min(x), np.max(x)))

# #### Apply mathematical functions

# In[7]:

zz = x**2 + 3*x**3
zz

# #### Conditionals
# Find the indices of the elements in an array that meet some criteria.
# In this example, we find all the elements of the array ___"```zz```"___ whose values lie between 100 and 500.

# In[8]:

zz_idx = np.where((zz >= 100) & (zz <= 500))[0]
print('zz_idx: {0}'.format(zz_idx))
zz[zz_idx]

# ## Manipulating Arrays
# There are a lot of things we can do to a _numpy array_.

# In[9]:

h1 = np.random.randint(10, 50, 50)
h1

# We can get the overall __size__ and __shape__ of the array.
#
# We can use the functions `numpy.size` and `numpy.shape` to get the total number of elements in an array and the shape of the array, respectively.

# In[10]:

np.size(h1)

# In[11]:

h1.shape

# In[12]:

A = np.array([[1, 2, 3, 4, 5],
              [6, 7, 8, 9, 10],
              [12, 13, 14, 16, 17],
              [13, 45, 67, 89, 90]])
A

# In[13]:

np.shape(A)

# You can also __transpose__ array `A`.

# In[14]:

A_t = np.transpose(A)
A_t

# Why are `Numpy` arrays better than lists?
# - Python lists are very _general_.
# - Lists __do not__ support matrix and dot multiplication, etc.
# - `Numpy` arrays are memory _efficient_.
# - Numpy arrays are __statically typed__ and __homogeneous__.
# - They are fast at mathematical functions.
# - They can be passed directly to code written in compiled languages, e.g. _C_ and _Fortran_.

# ### Array-generating functions
# For large arrays it is impractical to initialize the data manually using normal Python lists. Instead, we can use one of the many Numpy functions that generate arrays of different forms.

# #### numpy.arange
# We use this one to create a sequence of evenly spaced elements:

# In[15]:

np.arange(0, 10, 1)

# In[16]:

np.arange(0, 20, 5)

# In[17]:

np.arange(-40, 21, 10)

# #### linspace and logspace
# We use these functions to create ordered arrays, separated by even intervals in _real_- and _log_-space.

# In[18]:

B = np.linspace(0, 50)
B

# In[19]:

B = np.linspace(0, 100, 20)
B

# Array of __25 elements__ from $10^{0}$ to $10^{3}$, with __base of 10__:

# In[20]:

B = np.logspace(0, 3, 25)
B

# Creating an array of 11 elements from $e^{0}$ to $e^{10}$, with ```base == numpy.e```:

# In[21]:

B = np.logspace(0, 10, 11, base=np.e)
B

# #### Random Data

# In[22]:

from numpy import random

# In[23]:

# Uniform random numbers in [0, 1)
random.rand(5, 5)

# In[24]:

# 20 random integers from 10 to 29 (the upper bound is exclusive)
random.randint(10, 30, 20)
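# Since these examples draw random numbers, your outputs will differ from run to run. If you want reproducible results, you can fix the seed of the random-number generator first. A minimal sketch (the seed value `42` is arbitrary):

# In[ ]:

# Fixing the seed makes the "random" draws reproducible across runs
np.random.seed(42)
print(np.random.rand(3))

# Re-seeding with the same value gives an identical draw
np.random.seed(42)
print(np.random.rand(3))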
# #### Arrays of `zeros` and `ones`

# In[25]:

np.zeros(20)

# You can use these to populate other arrays.

# In[26]:

nelem = 10
C = np.ones(nelem)
C

# In[27]:

for ii in range(C.size):
    C[ii] = random.rand()
C

# ### Diagonals
# You can also construct an array with another array as the diagonal.

# In[28]:

np.diag(random.randint(10, 20, 5))

# ### Indexing
# You can choose which values to select.
# Normally, you select the `rows` first, and then the `cols` of a `numpy.ndarray`.

# In[29]:

M = random.rand(10, 5)
M

# Selecting the row at index 1, i.e. the 2nd row (indices start at 0):

# In[30]:

M[1,:]

# The column at index 1, i.e. the 2nd column:

# In[31]:

M[:,1]

# Select a range of columns and rows:

# In[32]:

M[1:3, 2:4]

# You can easily use this to create a __mask__, for when you are _cleaning_ your data.

# In[33]:

A = random.rand(3, 3)
np.fill_diagonal(A, np.nan)
A

# In[34]:

B = np.arange(0, 9).reshape((3, 3))
B

# Applying the __mask__ from $A \to B$:

# In[35]:

A_mask = np.isfinite(A)
A_mask

# In[36]:

B[A_mask]

# ### Binning your data
# This is probably one of the most useful features of `Numpy`.
# You can use it to bin your data and calculate __means__, __standard deviations__, etc.

# #### numpy.digitize

# In[37]:

# Creating my bin edges
bins = np.arange(0, 13)
bins

# In[38]:

# Generating data
data = 10 * random.rand(100)
data

# Now I want to bin my data and calculate the mean for each bin.

# In[39]:

# Defining statistical function to use
stat_func = np.nanmean

# Binning the data
data_bins = np.digitize(data, bins)
data_bins

# Calculating the __mean__ for each of the bins:

# In[40]:

failval = -10
bins_stat = np.array([stat_func(data[data_bins == ii])
                      if len(data[data_bins == ii]) > 0
                      else failval
                      for ii in range(1, len(bins))])
bins_stat

# You can put all of this into a function that estimates errors and more...

# In[41]:

import math

def myceil(x, base=10):
    """
    Returns the upper-bound integer of 'x' in base 'base'.

    Parameters
    ----------
    x: float
        number to be approximated to the closest number to 'base'

    base: float
        base used to calculate the closest 'largest' number

    Returns
    -------
    n_high: float
        Closest float number to 'x', i.e. upper-bound float.

    Example
    -------
    >>> myceil(12, 10)
    20.0
    >>> myceil(12.05, 0.1)
    12.1
    """
    n_high = float(base * math.ceil(float(x) / base))
    return n_high

def myfloor(x, base=10):
    """
    Returns the lower-bound integer of 'x' in base 'base'.

    Parameters
    ----------
    x: float
        number to be approximated to the closest number of 'base'

    base: float
        base used to calculate the closest 'smallest' number

    Returns
    -------
    n_low: float
        Closest float number to 'x', i.e. lower-bound float.

    Example
    -------
    >>> myfloor(12, 5)
    10.0
    """
    n_low = float(base * math.floor(float(x) / base))
    return n_low

def Bins_array_create(arr, base=10):
    """
    Generates an array of bin edges between [arr.min(), arr.max()]
    in steps of `base`.

    Parameters
    ----------
    arr: array_like, Shape (N,...), One-dimensional
        Array of numerical elements

    base: float, optional (default=10)
        Interval between bins

    Returns
    -------
    bins_arr: array_like
        Array of bin edges for given arr
    """
    base = float(base)
    arr = np.array(arr)
    assert(arr.ndim == 1)
    arr_min = myfloor(arr.min(), base=base)
    arr_max = myceil(arr.max(), base=base)
    bins_arr = np.arange(arr_min, arr_max + 0.5 * base, base)
    return bins_arr
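# As a quick sanity check of these helper functions, here is what they return for a few simple inputs (expected outputs shown as comments):

# In[ ]:

# Sanity-checking the helper functions defined above
print(myceil(12, 10))                        # 20.0 -> next multiple of 10 above 12
print(myfloor(12, 5))                        # 10.0 -> previous multiple of 5 below 12
print(Bins_array_create([3, 27], base=10))   # [ 0. 10. 20. 30.]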
# In[42]:

def Mean_std_calc_one_array(x1, y1, arr_len=0, statfunc=np.nanmean,
                            failval=np.nan, error='std', base=10.):
    """
    Calculates statistics of two arrays, e.g. scatter, error in `statfunc`, etc.

    Parameters
    ----------
    x1: array-like, shape (N,)
        array of x-values

    y1: array-like, shape (N,)
        array of y-values

    arr_len: int, optional (default = 0)
        minimum number of elements in a bin

    statfunc: numpy function, optional (default = numpy.nanmean)
        statistical function used to evaluate the bins

    failval: int or float, optional (default = numpy.nan)
        value used when the number of elements in the bin
        is smaller than or equal to `arr_len`

    error: string, optional (default = 'std')
        type of error to evaluate
        Options:
        - 'std': Evaluates the standard deviation of each bin
        - 'stat': Evaluates the error in the mean/median of each bin
        - 'none': Does not calculate the error in `y1`

    base: float
        Value of bin width in units of that of `x1`

    Returns
    -------
    x1_stat: array-like
        `statfunc` of each bin in `base` spacings for x1

    y1_stat: array-like
        `statfunc` of each bin in `base` spacings for y1

    y1_err: array-like
        error in `y1_stat` for each bin (zeros if error == 'none')
    """
    x1 = np.asarray(x1)
    y1 = np.asarray(y1)
    assert((x1.ndim == 1) & (y1.ndim == 1))
    assert((x1.size > 0) & (y1.size > 0))
    ## Computing bins
    x1_bins = Bins_array_create(x1, base=base)
    x1_digit = np.digitize(x1, x1_bins)
    ## Computing statistics in bins
    x1_stat = np.array([statfunc(x1[x1_digit == ii])
                        if len(x1[x1_digit == ii]) > arr_len else failval
                        for ii in range(1, x1_bins.size)])
    y1_stat = np.array([statfunc(y1[x1_digit == ii])
                        if len(y1[x1_digit == ii]) > arr_len else failval
                        for ii in range(1, x1_bins.size)])
    ## Computing error in the data
    if error == 'std':
        # Standard deviation of each bin
        stat_err = np.nanstd
        y1_err = np.array([stat_err(y1[x1_digit == ii])
                           if len(y1[x1_digit == ii]) > arr_len else failval
                           for ii in range(1, x1_bins.size)])
    elif error == 'stat':
        # Error in the mean/median of each bin
        stat_err = np.nanstd
        y1_err = np.array([stat_err(y1[x1_digit == ii]) / np.sqrt(len(y1[x1_digit == ii]))
                           if len(y1[x1_digit == ii]) > arr_len else failval
                           for ii in range(1, x1_bins.size)])
        if (statfunc == np.median) or (statfunc == np.nanmedian):
            # The error in the median is ~1.253 times the error in the mean
            y1_err *= 1.253
    else:
        y1_err = np.zeros(y1_stat.size)
    return x1_stat, y1_stat, y1_err

# __Example of using these functions__:

# In[43]:

# Defining arrays
x_arr = np.arange(100)
y_arr = 50 * np.random.randn(x_arr.size)

# Computing the mean/median and their errors for `x_arr` and `y_arr`
x_stat, y_stat, y_err = Mean_std_calc_one_array(x_arr, y_arr,
                                                statfunc=np.nanmean,
                                                failval=np.nan,
                                                base=10)
x_stat2, y_stat2, y_err2 = Mean_std_calc_one_array(x_arr, y_arr,
                                                   statfunc=np.nanmedian,
                                                   failval=np.nan,
                                                   base=10)

# In[44]:

plt.style.use('seaborn-notebook')
plt.clf()
plt.close()
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, facecolor='white')
ax.plot(x_arr, y_arr, 'ro', label='Data')
ax.errorbar(x_stat, y_stat, yerr=y_err, color='blue', marker='o',
            linestyle='--', label='Mean')
ax.errorbar(x_stat2, y_stat2, yerr=y_err2, color='green', marker='o',
            linestyle='--', label='Median')
ax.set_xlabel('X axis', fontsize=20)
ax.set_ylabel('Y axis', fontsize=20)
ax.set_title('Data and the Binned Data', fontsize=24)
plt.legend(fontsize=20)
plt.show()

# With this function, it is really easy to compute statistics on binned data, as well as to ___estimate errors___ on the data.
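# As an aside, `SciPy` (introduced below) ships a convenience function, `scipy.stats.binned_statistic`, that performs much of this binning logic in one call. A minimal sketch, assuming the same `x_arr` and `y_arr` defined above:

# In[ ]:

from scipy import stats

# Mean of `y_arr` in bins of width 10 along `x_arr`
bin_edges = np.arange(0, 110, 10)
y_mean, edges, _ = stats.binned_statistic(x_arr, y_arr, statistic='mean',
                                          bins=bin_edges)

# Standard deviation of `y_arr` in the same bins
y_std, _, _ = stats.binned_statistic(x_arr, y_arr, statistic='std',
                                     bins=bin_edges)

bin_centers = 0.5 * (edges[1:] + edges[:-1])
print(bin_centers)
print(y_mean)
print(y_std)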
# ### Reshaping, resizing and stacking arrays
# One can always modify the shape of a `numpy.ndarray`, as well as append it to a pre-existing array.

# In[45]:

A = np.array([[n + m * 10 for n in range(5)] for m in range(5)])
A

# In[46]:

n, m = A.shape

# In[47]:

B = A.reshape((1, n * m))
B

# In[48]:

A_f = A.flatten()
A_f

# In[49]:

C = random.rand(A.size)
C

# In[50]:

C.shape

# In[51]:

# Stacking the two arrays as columns
D = np.column_stack((A_f, C))
D

# In[52]:

# Selecting the 3rd through 10th rows
D[2:10]

# #### np.concatenate
# You can also concatenate different arrays:

# In[53]:

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])

# In[54]:

np.concatenate((a, b))

# In[55]:

np.concatenate((a, b.T), axis=1)

# ### Copy and "Deep Copy"
# Sometimes it is important to create new _copies_ of arrays and other objects. For this reason, one uses __numpy.copy__ to create _new_ copies of arrays.

# In[56]:

A = np.array([[1, 2], [3, 4]])
A

# In[57]:

# `B` is now referring to the same array data as `A`
B = A

# If we make any changes to `B`, __`A` will also be affected by this change__.

# In[58]:

B[0,0] = 10
B

# In[59]:

A

# To get a __completely independent, new object__, you would use:

# In[60]:

B = np.copy(A)

# Modifying `B`
B[0,0] = -5
B

# In[61]:

A

# The array `A` was not affected by this change. This is important when you're constantly re-defining new arrays.

# # Scipy - Library of Scientific Algorithms for Python
# `SciPy` provides a large number of higher-level scientific algorithms.
# It includes:
# - Special Functions
# - Integration
# - Optimization
# - Interpolation
# - Fourier Transforms
# - Signal Processing
# - Linear Algebra
# - Statistics
# - Multi-dimensional image processing

# In[62]:

import scipy as sc

# ## Interpolation
# You can use `Scipy` to interpolate your data, e.g. with the `interp1d` function.

# In[63]:

from scipy.interpolate import interp1d

# In[64]:

def f(x):
    return np.sin(x)

# In[65]:

n = np.arange(0, 10)
x = np.linspace(0, 9, 100)
y_meas = f(n) + 0.1 * np.random.randn(len(n))  # simulate measurement with noise
y_real = f(x)

linear_interpolation = interp1d(n, y_meas)
y_interp1 = linear_interpolation(x)

cubic_interpolation = interp1d(n, y_meas, kind='cubic')
y_interp2 = cubic_interpolation(x)

# In[66]:

fig, ax = plt.subplots(figsize=(15, 6))
ax.set_facecolor('white')
ax.plot(n, y_meas, 'bs', label='noisy data')
ax.plot(x, y_real, 'k', lw=2, label='true function')
ax.plot(x, y_interp1, 'r', label='linear interp')
ax.plot(x, y_interp2, 'g', label='cubic interp')
ax.legend(loc=3, prop={'size': 20})
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=15)

# ### KD-Trees
# You can also use `SciPy` to construct __KD-Trees__ for a set of points.

# In[67]:

Lbox = 250.
Npts = 1000

# Creating cartesian coordinates
x = np.random.uniform(0, Lbox, Npts)
y = np.random.uniform(0, Lbox, Npts)
z = np.random.uniform(0, Lbox, Npts)
sample1 = np.vstack([x, y, z]).T
sample1

# In[68]:

sample1.shape

# Let's say we want to know how many pairs of points are separated by distances between 30 and 50. To answer this, you construct a __KD-Tree__.

# In[69]:

from scipy.spatial import cKDTree

# In[70]:

# Initializing the KD-Tree
KD_obj = cKDTree(sample1)

# In[71]:

N_neighbours = KD_obj.count_neighbors(KD_obj, 50) - \
               KD_obj.count_neighbors(KD_obj, 30)
print("Number of Neighbours: {0}".format(N_neighbours))

# Let's say you want to get the distances to the __Nth-nearest__ neighbour of each point.

# In[72]:

k_nearest = 4
dist_k, dist_k_idx = KD_obj.query(sample1, k_nearest)
dist_k

# You can also get the indices:

# In[73]:

dist_k_idx

# The first column corresponds to the point _itself_, since each point is its own nearest neighbour at zero distance.

# You can also find all pairs that are separated by at most a distance _r_:

# In[74]:

pairs = KD_obj.query_ball_tree(KD_obj, 30)
pairs[0:10]
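# Each entry of `pairs` lists the neighbours of the corresponding point, including the point itself. As a small sketch, here is one way to count the number of _distinct_ pairs within `r = 30`; `cKDTree.query_pairs` returns the same information directly:

# In[ ]:

# Counting distinct pairs from the `query_ball_tree` output:
# each pair (i, j) appears twice, plus the Npts self-matches
n_pairs = (sum(len(p) for p in pairs) - Npts) // 2
print("Distinct pairs within r = 30: {0}".format(n_pairs))

# `query_pairs` returns the distinct (i, j) pairs with i < j directly
print(len(KD_obj.query_pairs(30)))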
# __So that's it for today's lesson__
#
# Next week:
# - Advanced Visualization
# - Seaborn