#!/usr/bin/env python
# coding: utf-8

# Notebook export: basic data handling with standard Python, NumPy and
# matplotlib, using the tab-delimited "populations.txt" data set
# (columns: year, hares, lynxes, carrots).
#
# NOTE: this script is meant to be run under IPython/Jupyter; it relies on
# get_ipython() for the "magic" commands below.

# #### In IPython (Jupyter) Notebook we can use "magic" commands that allow us to execute external functions or commands such as standard unix commands.

# In[1]:


get_ipython().run_line_magic('pwd', '')


# In[2]:


get_ipython().run_line_magic('cd', '..')


# In[3]:


# NOTE: machine-specific path; adjust it to wherever populations.txt lives.
# A raw string is required because "\U" is an invalid escape in a normal literal.
get_ipython().run_line_magic('cd', r'C:\Users\bmobashe\Desktop\Class\Data')


# In[4]:


get_ipython().run_line_magic('more', 'populations.txt')


# #### Let's first look at how we can use standard Python to read in the data file. We can read the file one line at a time and split each line using tab as a delimiter. Each row is added as a list to a table (in this case a list of lists).

# In[6]:


poptable = []
# "with" guarantees the file is closed even if a row fails to parse
with open("populations.txt", "r") as datafile:
    # first line of the file contains labels for column names
    labels_line = datafile.readline().strip()
    for line in datafile:
        line = line.strip()
        if not line:
            # skip blank lines (e.g. a trailing empty line at the end of the file)
            continue
        row = line.split("\t")
        record = [int(row[0]), float(row[1]), float(row[2]), float(row[3])]
        print(record)
        poptable.append(record)

# number of data rows read
total = len(poptable)
print(total)


# In[9]:


poptable


# In[8]:


# printing the top 5 elements (rows) of poptable from index 0 up to (but not including) index 5
poptable[0:5]


# #### Following is an example of creating Python dictionaries for each of the species in the data. The year (in the first column) is used as the key for the population value of each of the species for that year.

# In[19]:


hares = {}
lynxes = {}
carrots = {}
for row in poptable:
    hares[row[0]] = row[1]
    lynxes[row[0]] = row[2]
    carrots[row[0]] = row[3]

list(hares.items())


# #### This is useful if we need to access information regarding one of the species for a specified year.

# In[20]:


print(hares[1903])


# In[21]:


# finding the years during which the population of hares was greater than 50K
# Here we'll use standard Python list comprehensions
hares_above_50k = [yr for yr, count in hares.items() if count > 50000.0]
hares_above_50k


# In[22]:


# Finding the year(s) with maximal value of Hares
# (the maximum is computed once instead of once per dictionary key)
peak_hares = max(hares.values())
maxhares = [yr for yr, count in hares.items() if count == peak_hares]
for yr in maxhares:
    print(yr, hares[yr])


# #### So far, we have not used Numpy, only standard Python. But, many operations on data involving tables or matrices are much simpler and more efficient using Numpy. Let's now try using NumPy arrays:

# In[23]:


import numpy as np


# #### If we want to use the Python list already created, we must first convert it to a 2D NumPy array:

# In[25]:


pop = np.array(poptable)
print(pop)


# #### However, we could have, from the start, loaded the data into a Numpy array. One way to do this is via "loadtxt" which reads numerical values from tab-delimited or CSV files (converted into floats).

# In[26]:


# skiprows=1 skips the header line holding the column labels
poptable = np.loadtxt('populations.txt', skiprows=1)
print(poptable)


# In[36]:


poptable.shape


# #### Note that poptable is a 21x4 2-d Numpy array (a matrix)

# #### In fact, we can assign each column directly into separate 1-d arrays, by splitting the transpose of poptable:

# In[28]:


# NOTE: from here on hares, lynxes and carrots are 1-d arrays, not the
# dictionaries built earlier.
year, hares, lynxes, carrots = poptable.T
np.set_printoptions(linewidth=100)
print(year)
print(hares)
print("Mean Hare Population: ", hares.mean())


# #### Numpy allows us to easily perform operations on rows, columns, or to the whole array:

# In[29]:


# finding all years when the population of one of the species is above 50k
# (only the species columns are tested, so the year column cannot trigger a match)
above_50k = np.any(poptable[:, 1:] > 50000, axis=1)  # axis=1 means the operation will be performed across columns
print(above_50k)
print(year[above_50k])


# #### Let's print some summary statistics on each of the species:

# In[31]:


pop_no_year = poptable[:, 1:]  # Removing the first column ("Year")
print(" Hares Lynxes Carrots")
print("Mean:", pop_no_year.mean(axis=0))
print("Std: ", pop_no_year.std(axis=0))


# In[32]:


# Finding indices of years when one of the populations was at max
j_max_years = np.argmax(pop_no_year, axis=0)  # ranging over rows for each column
print("Indecies for the maximums:", j_max_years)
print(" Hares Lynxes Carrots")
print("Max. year:", year[j_max_years])


# In[33]:


# Ranging over cols for each row, find the species with the highest pop for each year
max_species = np.argmax(pop_no_year, axis=1)
species = np.array(['Hare', 'Lynx', 'Carrot'])
print(max_species)
print("Max specie from 1900 to 1920:")
print(species[max_species])


# #### We can look at correlations among the three population variables

# In[35]:


# corrcoef treats each row as one variable, hence the transpose
corr_matrix = np.corrcoef(pop_no_year.T)
print(corr_matrix)


# #### Now let's look at multi-dimensional sorting. In order to sort by a particular field in the data, we need to convert into a structured array with keys.

# In[36]:


# Reinterpret each row of four floats as one record with named fields
pop_with_keys = poptable.view(dtype=[('year', 'float'),
                                     ('hares', 'float'),
                                     ('lynxes', 'float'),
                                     ('carrots', 'float')])


# In[39]:


pop_with_keys


# #### Now, array columns can be accessed directly with the appropriate key label:

# In[41]:


print(pop_with_keys['hares'])


# #### Now we can do sorting using the desired label. For example, we can sort the table using the 'hares' field:

# In[42]:


sorted_by_hares = np.sort(pop_with_keys, order='hares', axis=0)
print(sorted_by_hares)


# #### Example of basic visualization with matplotlib:

# In[43]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[44]:


plt.plot(year, hares)


# In[45]:


plt.plot(year, hares, label='Hares')
plt.plot(year, lynxes, label='Lynxes')
plt.plot(year, carrots, label='Carrots')
# the legend entries come from the label= arguments above
plt.legend()
plt.ylabel('Population')
plt.xlabel('Year')
plt.show()


# In[46]:


plt.hist(carrots, bins=8, alpha=0.5)
plt.xlabel('Carrots')
plt.ylabel('Count')
plt.title('Histogram of Carrot Populaions')
plt.axis([35000, 50000, 0, 6])  # [xmin, xmax, ymin, ymax]
plt.grid(True)


# In[47]:


plt.scatter(hares, carrots, color="blue", marker="*")
plt.xlabel('Hares')
plt.ylabel('Carrots')
plt.title('Hares v. Carrots')
plt.grid(True)


# In[48]:


fig = plt.figure(figsize=(12, 4))

# Create an Axes object.
ax1 = fig.add_subplot(1, 2, 1)  # one row, two columns, first plot
# Plot the data.
ax1.scatter(hares, carrots, color="red", marker="*")
ax1.set_title("Hares vs. Carrots")
# Add some axis labels.
ax1.set_xlabel("Hare Population")
ax1.set_ylabel("Carrot Population")

ax2 = fig.add_subplot(1, 2, 2)  # one row, two columns, 2nd plot
# Plot the data.
ax2.scatter(hares, lynxes, color="blue", marker="^")
ax2.set_title("Hares vs. Lynxes")
# Add some axis labels.
ax2.set_xlabel("Hare Population")
ax2.set_ylabel("Lynx Population")

# Produce an image.
# fig.savefig("scatterplot.png")


# In[49]: