#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd


# #### Reading the file into a dataframe: 

# In[2]:


# Tab-separated populations file, fetched over HTTP from the course server.
populations_url = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/populations.txt"
pop_df = pd.read_csv(populations_url, sep="\t")
pop_df


# In[3]:


# Preview the first five rows of the table.
pop_df.head(n=5)


# In[4]:


# Column labels of the dataframe.
pop_df.columns


# In[5]:


# The table's contents as a NumPy array, without the row/column labels.
pop_df.values


# In[6]:


# Data type of each column.
pop_df.dtypes


# #### We can access columns (Pandas series) using their labels:

# In[7]:


# Select the "hare" column by its label. NOTE: despite the "_df" suffix, the
# result is a pandas Series, not a DataFrame.
hare_df = pop_df["hare"]
hare_df


# #### Or alternatively using the label as a property of the dataframe:

# In[8]:


# Attribute-style access returns the same column as pop_df["hare"]. It only
# works when the label is a valid Python identifier that does not clash with
# an existing DataFrame attribute or method.
pop_df.hare


# #### The usual numeric operations are available for dataframes or series:

# In[9]:


# Average of the hare series selected earlier.
hare_mean = hare_df.mean()
print("Mean Hare Population: ", hare_mean)


# In[13]:


# Select the three population columns once so that "year" is left out of
# both statistics.
species_df = pop_df[["hare", "lynx", "carrot"]]

print("Mean Populations: \n")
print(species_df.mean())
print("\n")
print("Standard Deviations: \n")
print(species_df.std())


# #### The describe() method provides a detailed description of variables:

# In[14]:


# Population columns shared by the summaries below ("year" is excluded).
population_cols = ["hare", "lynx", "carrot"]

# Summary statistics for the population columns only.
pop_df[population_cols].describe()


# In[15]:


# Summary statistics for every column of the table.
pop_df.describe()


# #### A better way to do correlation analysis:

# In[16]:


# Pairwise correlation matrix of the population columns.
pop_df[population_cols].corr()


# In[17]:


pop_df


# #### Sorting by a column is just as easy:

# In[18]:


# Rows in ascending order of hare population; a new frame is returned and
# pop_df itself keeps its original order.
pop_df.sort_values("hare")


# #### More examples of accessing and manipulating data in dataframes:

# In[25]:


# Boolean mask: True for every row whose hare population exceeds 50k.
hare_above_50K = pop_df["hare"] > 50000
# Rows selected by the mask, reused for both the full rows and their years.
hare_rows_above_50K = pop_df[hare_above_50K]
print(hare_above_50K)
print("\n")
print(hare_rows_above_50K)
print("\n")
print(hare_rows_above_50K.year)


# In[26]:


# Rows where the hare or the lynx population (or both) exceeds 50k.
above_50K = pop_df["hare"].gt(50000) | pop_df["lynx"].gt(50000)
pop_df[above_50K]


# In[27]:


# Copy of the table without the "year" column; drop() returns a new frame,
# so pop_df is left untouched.
pop2 = pop_df.drop(columns="year")
pop2


# #### When necessary, we can convert a dataframe (or a series) into a Numpy array:

# In[28]:


# Independent NumPy copy of the remaining columns.
poptable = pop2.to_numpy(copy=True)
poptable


# #### Examples of basic visualization with Matplotlib and with Pandas:

# In[29]:


# The "%matplotlib inline" magic only exists inside IPython/Jupyter. Under a
# plain Python interpreter get_ipython is not defined, so skip the magic there
# instead of aborting the whole script with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import matplotlib.pyplot as plt


# In[30]:


# Hare population over time.
plt.plot(pop_df["year"], pop_df["hare"])


# In[31]:


# Start a fresh figure. Without it, when figures are not closed between cells
# (e.g. when this file is run as a script), these lines would be added to the
# axes of the hare plot above and the label-only legend() call below would
# name the wrong lines.
plt.figure()
# Select the columns explicitly so that the line order is guaranteed to match
# the legend labels. The labels are given once, in legend().
plt.plot(pop_df["year"], pop_df[["hare", "lynx", "carrot"]])
plt.legend( ('Hares','Lynxes','Carrots') )
plt.ylabel('Population')
plt.xlabel('Year')
plt.show()


# In[32]:


# Draw the histogram on its own figure so it is never overlaid on axes left
# over from an earlier plot (plt.show() does not close figures on
# non-interactive backends).
plt.figure()
# Distribution of the carrot counts in 8 bins, half-transparent bars.
plt.hist(pop_df["carrot"], bins=8, alpha=0.5)
plt.xlabel('Carrots')
plt.ylabel('Count')
plt.title('Histogram of Carrot Populations')
# Fixed view limits: [xmin, xmax, ymin, ymax].
plt.axis([36000, 49000, 0, 6])
plt.grid(True)


# #### Pandas has its own versatile "plot" method that can handle most types of charts:

# In[33]:


# Line chart of every remaining column against the year.
pop_df.plot.line(x="year", title="Populations")


# In[34]:


# Scatter plot of lynx counts against carrot counts.
pop_df.plot.scatter(x="carrot", y="lynx")


# In[35]:


# Box plots of the three population columns, side by side.
population_cols = ["hare", "lynx", "carrot"]
pop_df.boxplot(column=population_cols, return_type='axes')


# In[36]:


# One random fox count per row of pop_df, drawn uniformly from the integers
# in [5000, 20000). The size is taken from the dataframe rather than a
# hard-coded row count, so building the Series below cannot fail with a
# length mismatch if the dataset's length ever changes.
fox_col = np.random.randint(low=5000, high=20000, size=len(pop_df))
fox_col


# In[37]:


# Attach the values as a new "fox" column, aligned on pop_df's own index.
pop_df["fox"] = pd.Series(fox_col, index=pop_df.index)
pop_df


# In[38]:


# Filled-area chart of the fox column against the year.
pop_df.plot.area(x="year", y="fox", title="Fox Population")


# In[41]:


# Grid of pairwise scatter plots for the three population columns, with
# 8-bin histograms for the individual columns. The trailing semicolon is kept
# so a notebook does not echo the returned array of axes.
population_cols = ["hare", "lynx", "carrot"]
pd.plotting.scatter_matrix(
    pop_df[population_cols],
    figsize=(14, 14),
    hist_kwds={'bins': 8},
    alpha=.5,
    marker='o',
    s=50,
);


# In[ ]: