#!/usr/bin/env python
# coding: utf-8
"""Pandas walkthrough on the hare / lynx / carrot populations dataset.

Exported from a Jupyter notebook: the ``# In[n]:`` markers delimit the
original cells and the ``# ####`` comments are the notebook's markdown
headings. Bare expressions (e.g. ``pop_df``) only display a value when a
cell is run interactively; they have no effect when run as a script.
"""

# In[1]:

import numpy as np
import pandas as pd


# #### Reading the file into a dataframe:

# In[2]:

# Tab-separated text file fetched over HTTP.
pop_df = pd.read_csv(
    "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/populations.txt",
    sep='\t',
)
pop_df


# In[3]:

pop_df.head(5)


# In[4]:

pop_df.columns


# In[5]:

pop_df.values


# In[6]:

pop_df.dtypes


# #### We can access columns (Pandas series) using their labels:

# In[7]:

hare_df = pop_df["hare"]
hare_df


# #### Or alternatively using the label as a property of the dataframe:

# In[8]:

pop_df.hare


# #### The usual numeric operations are available for dataframes or series:

# In[9]:

print("Mean Hare Population: ", hare_df.mean())


# In[13]:

print("Mean Populations: \n")
print(pop_df[["hare", "lynx", "carrot"]].mean())
print("\n")
print("Standard Deviations: \n")
print(pop_df[["hare", "lynx", "carrot"]].std())


# #### The describe() method provides a detailed description of variables:

# In[14]:

pop_df[["hare", "lynx", "carrot"]].describe()


# In[15]:

pop_df.describe()


# #### A better way to do correlation analysis:

# In[16]:

pop_df[["hare", "lynx", "carrot"]].corr()


# In[17]:

pop_df


# #### Also sorting is done easily:

# In[18]:

pop_df.sort_values(by=['hare'])


# #### More examples of accessing and manipulating data in dataframes:

# In[25]:

# finding all instances when the population of hares is above 50k
hare_above_50K = pop_df.hare > 50000  # boolean mask, one entry per row
print(hare_above_50K)
print("\n")
print(pop_df[hare_above_50K])
print("\n")
print(pop_df[hare_above_50K].year)


# In[26]:

# finding all instances when the population of hares or lynxes is above 50k
above_50K = (pop_df["hare"] > 50000) | (pop_df["lynx"] > 50000)
pop_df[above_50K]


# In[27]:

# Same table without the "year" column; pop_df itself is left untouched.
pop2 = pop_df.drop("year", axis=1)
pop2


# #### When necessary, we can convert a dataframe (or a series) into a Numpy array:

# In[28]:

poptable = np.array(pop2)
poptable


# #### Example of basic visualization using Pandas and with Matplotlib:

# In[29]:

# get_ipython() only exists inside IPython/Jupyter; skip the inline-backend
# magic instead of failing with NameError when run as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import matplotlib.pyplot as plt


# In[30]:

plt.plot(pop_df["year"], pop_df["hare"])


# In[31]:

# One line is drawn per column of pop2; the legend names them in order.
plt.plot(pop_df["year"], pop2, label=['Hares', 'Lynxes', 'Carrots'])
plt.legend(('Hares', 'Lynxes', 'Carrots'))
plt.ylabel('Population')
plt.xlabel('Year')
plt.show()


# In[32]:

plt.hist(pop_df["carrot"], bins=8, alpha=0.5)
plt.xlabel('Carrots')
plt.ylabel('Count')
plt.title('Histogram of Carrot Populations')
# Axis limits: [xmin, xmax, ymin, ymax].
plt.axis([36000, 49000, 0, 6])
plt.grid(True)


# #### Pandas has its own versatile "plot" method that can handle most types of charts:

# In[33]:

pop_df.plot(x="year", title="Populations")


# In[34]:

pop_df.plot(x="carrot", y="lynx", kind="scatter")


# In[35]:

pop_df.boxplot(column=["hare", "lynx", "carrot"], return_type='axes')


# In[36]:

# Random fox population, one value per row. The size follows the dataframe
# length so the column assignment below cannot fail on a length mismatch.
fox_col = np.random.randint(low=5000, high=20000, size=len(pop_df))
fox_col


# In[37]:

pop_df["fox"] = pd.Series(fox_col, index=pop_df.index)
pop_df


# In[38]:

pop_df.plot(x="year", y="fox", kind="area", title="Fox Population")


# In[41]:

# The trailing semicolon suppresses the cell's output in a notebook.
pd.plotting.scatter_matrix(
    pop_df[["hare", "lynx", "carrot"]],
    figsize=(14, 14),
    hist_kwds={'bins': 8},
    alpha=.5,
    marker='o',
    s=50,
);


# In[ ]: