#!/usr/bin/env python
# coding: utf-8

# # Data scraping, data wrangling, data analytics, exploratory data analysis, advanced plotting and clustering with love.

# In this tutorial, we are going to read the csv file and the json file we saved in the previous tutorial.
#
# Then we are going to use **pandas** [1] to do data wrangling and manipulation of the `DataFrame`.
#
# Finally, we are going to do some basic data analytics and basic plotting using **matplotlib** [2].

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns


# To load the csv file, we proceed as follows,

# In[4]:

#shot_df.to_csv(path_or_buf='test.csv', mode='w')
shot_df = pd.read_csv(filepath_or_buffer='test.csv')
#shot_df = pd.read_csv('test.csv')


# To print the first 4 rows of the `DataFrame`, you can proceed as follows,

# In[5]:

shot_df.head(4)


# Keep in mind that this is a large `DataFrame`; if you do not use **.head(4)**, it will print all the rows in the `DataFrame`.

# To force **pandas** to display all the columns, you can proceed as follows (we only display the first 4 rows),

# In[6]:

pd.set_option('display.max_columns', None)
shot_df.head(4)
#shot_df


# To erase a column in a `DataFrame`, we can proceed as follows.
#
# Notice that we want to erase the column named `Unnamed: 0`, and that `axis=1` tells **drop** to remove a column instead of a row.

# In[7]:

shot_df1 = shot_df.drop('Unnamed: 0', axis=1)


# To display the information in `shot_df1` (we only display the first 2 rows),

# In[8]:

shot_df1.head(2)


# As you can see, we erased the column `Unnamed: 0`.

# Alternatively, we can use column 0 as the row labels of the `DataFrame`.

# In[9]:

shot_df2 = pd.read_csv(filepath_or_buffer='test.csv', index_col=0)


# To display the information in `shot_df2` (we only display the first 2 rows),

# In[10]:

shot_df2.head(2)


# We can do indexing and slicing in a `DataFrame`.
#
# To display all the columns of the first two rows of the `DataFrame` `shot_df2`,

# In[11]:

shot_df2.iloc[0:2, :]


# or we can display columns 10 to 12 of the first five rows,

# In[75]:

shot_df2.iloc[0:5, 10:13]


# To load a json file using the module **json** (the hard way),

# In[12]:

#import json
#import pprint
#from pprint import pprint

#with open('data_json.json') as data_file:
#    data_json = json.load(data_file)

#pprint(data_json)


# In[13]:

#type(data_json)


# In[14]:

#data_json['resultSets'][0]['headers']


# In[15]:

#data_json['resultSets'][0]['rowSet']


# In[16]:

#for x in data_json:
#    print (x)


# In[17]:

#for x in data_json['resultSets']:
#    print (x)


# In[18]:

#data_json
#type(data_json['resultSets'][0])
#print (data_json['resultSets'][0])


# To load the json file using **pandas**, we proceed as follows,

# In[19]:

#pd_json = pd.read_json(path_or_buf='data_json.json')
pd_json = pd.read_json(path_or_buf='data_json.json', typ='series')


# Remember, you can inspect the json file using **JSONView**, as in the previous tutorial.

# To know the type of the json file we just loaded,

# In[20]:

type(pd_json)


# Notice that it is a `series` and not a `DataFrame`. Later on we are going to see how to convert a `series` to a `DataFrame`; for the moment this does not generate any problem.
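# Before printing the whole series, a quick optional check: since we loaded the json with `typ='series'`, the top-level blocks of the file become the index of the series. A minimal sketch, assuming the file contains the `resource`, `parameters` and `resultSets` blocks we use below,

# In[ ]:

# list the top-level blocks of the json file (they form the series index)
list(pd_json.index)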
# The next line will print the content of the json file.
# This is the same information you see when you use **JSONView**, but now we read it directly in Python.

# In[21]:

pd_json


# These lines will print the information inside each block of the json file; they are commented as they print a lot of information.

# In[22]:

#pd_json.parameters


# In[23]:

#pd_json.resource


# In[24]:

#pd_json.resultSets


# At this point, we can create the `DataFrame` using the json file we imported with **pandas** (pretty much as in the previous tutorial).
#
# First we need to grab the headers and the shot chart data.

# In[25]:

# Grab the headers to be used as column headers for our DataFrame
json_headers = pd_json['resultSets'][0]['headers']


# In[26]:

# Grab the shot chart data
json_shots = pd_json['resultSets'][0]['rowSet']


# To display the content of `json_headers`,

# In[27]:

json_headers


# To display the content of `json_shots`. This line is commented as it prints a lot of information.

# In[28]:

#json_shots


# To create the `DataFrame` using `json_headers` and `json_shots`,

# In[29]:

json_to_pd = pd.DataFrame(json_shots, columns=json_headers)


# and to display the first 5 rows of `json_to_pd`,

# In[30]:

json_to_pd.head()


# To determine if `json_to_pd` is equal to `shot_df1` (we only display the first 5 rows),

# In[31]:

json_to_pd.head() == shot_df1.head()


# Notice that the only difference is the column **GAME_ID**. When loading the csv, **pandas** dropped the leading zeroes; the rest of the columns are equal.
#
# This can be fixed, but I will leave it to you as an exercise.

# The `DataFrame` `shot_df1` contains the shot chart data of all the field goal attempts LeBron James took during the 2014-15 regular season.
#
# We are specifically interested in the data saved in the columns `LOC_X`, `LOC_Y` and `SHOT_MADE_FLAG`. The columns `LOC_X` and `LOC_Y` contain the coordinate values of each shot, measured from the basket rim. The column `SHOT_MADE_FLAG` contains the outcome of the shot, 0 for missed and 1 for converted.
#
# To extract from the `DataFrame` `shot_df1` the converted shots, that is **shot_df1.SHOT_MADE_FLAG == 1**, we proceed as follows (we only display the first 2 rows),

# In[32]:

shot_df1[shot_df1.SHOT_MADE_FLAG == 1].head(2)


# To save the previous output in the `DataFrame` `converted`,

# In[33]:

converted = shot_df1[shot_df1.SHOT_MADE_FLAG == 1]


# To get the dimensions of the `DataFrame`,

# In[34]:

converted.shape


# To save the output of the missed shots in the `DataFrame` `missed`,

# In[35]:

missed = shot_df1[shot_df1.SHOT_MADE_FLAG == 0]


# Let's create a new `DataFrame` as follows,

# In[36]:

new_df = pd.DataFrame()


# and let's extract some information from the `DataFrame` `shot_df1` and put it in the `DataFrame` `new_df`.
#
# Notice that we can access the information in a `DataFrame` as
#
# `shot_df1.LOC_X`
#
# or
#
# `shot_df1['LOC_X']`.

# In[37]:

new_df['LOC_X'] = shot_df1.LOC_X
new_df['LOC_Y'] = shot_df1['LOC_Y']
new_df['SHOT_MADE_FLAG'] = shot_df1['SHOT_MADE_FLAG']
new_df['SHOT_TYPE'] = shot_df1['SHOT_TYPE']


# To display the information in the `DataFrame` `new_df` (we only display the first 5 rows),

# In[38]:

#pprint(new_df.head(4))
new_df.head()
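# Before filtering by shot type, it can help to see which labels the `SHOT_TYPE` column contains and how often each one occurs. A minimal sketch using `value_counts` (the two labels, '2PT Field Goal' and '3PT Field Goal', are the ones used in the next cells),

# In[ ]:

# count the number of attempts per shot type
new_df['SHOT_TYPE'].value_counts()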
# We can also extract information from a `DataFrame` using strings.
# From the `DataFrame` `new_df`, let's extract the `SHOT_TYPE` information as follows,

# In[39]:

three_pointer = new_df[new_df.SHOT_TYPE == '3PT Field Goal']
two_pointer = new_df[new_df.SHOT_TYPE == '2PT Field Goal']


# To display the information contained in the `DataFrames` `three_pointer` and `two_pointer` (we only display the first 2 rows),

# In[40]:

three_pointer.head(2)


# In[41]:

two_pointer.head(2)


# To know the type of `two_pointer`,

# In[42]:

type(two_pointer)


# Notice that if we extract the column `two_pointer['LOC_X']` and save it in the object `tmp_object`, as follows,

# In[43]:

tmp_object = two_pointer['LOC_X']


# the type of the object `tmp_object` is a **pandas** `series` instead of a `DataFrame`.

# In[44]:

type(tmp_object)


# To convert `tmp_object` to a `DataFrame`, we proceed as follows.
#
# Notice that the option `name` in `to_frame(name='LOC_X')` corresponds to the name we want to assign to that column in the `DataFrame` `tmp_object1`.

# In[45]:

tmp_object1 = two_pointer['LOC_X'].to_frame(name='LOC_X')


# which has a type

# In[46]:

type(tmp_object1)


# To convert a pandas series or DataFrame to a numpy array, we proceed as follows,

# In[47]:

# .to_numpy() replaces the deprecated .as_matrix() method
np_array0 = shot_df1.to_numpy()
np_array1 = two_pointer['LOC_X'].to_numpy()


# which has a type

# In[48]:

type(np_array0)


# In[49]:

type(np_array1)


# We can also do mathematical operations between columns (and rows) of a `DataFrame`, for example (we only display the first 5 rows),

# In[50]:

(three_pointer['LOC_X']*100).head()
#(three_pointer*0).head()


# In[51]:

((three_pointer['LOC_X']-three_pointer.LOC_Y)/three_pointer.LOC_Y**2).head()


# In[52]:

(three_pointer['LOC_X']/three_pointer['LOC_X'].max()).head()


# To create a new `DataFrame` with the normalized values of `LOC_X` and `LOC_Y`,

# In[53]:

df_norm = pd.DataFrame()
c1 = three_pointer['LOC_X']/three_pointer['LOC_X'].max()
c2 = three_pointer['LOC_Y']/three_pointer['LOC_Y'].max()
df_norm['c1'] = c1
df_norm['c2'] = c2
df_norm.head()


# Notice that we only display the first 5 rows.

# To generate summary statistics of a `DataFrame`,

# In[54]:

shot_df1.describe()


# Notice that it computes statistics only for the columns with numerical values.

# To count the total number of 3 point shots attempted,

# In[55]:

three_pointer.count()


# To count the number of 3 point shots converted, using only the column **SHOT_MADE_FLAG**,

# In[56]:

three_pointer.SHOT_MADE_FLAG[three_pointer.SHOT_MADE_FLAG == 1].count()


# and to count the number of 3 point shots missed,

# In[57]:

three_pointer.SHOT_MADE_FLAG[three_pointer.SHOT_MADE_FLAG == 0].count()
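# With the two counts above, we can also compute the 3 point shooting percentage for the season. A minimal sketch (it simply divides the converted attempts by the total attempts),

# In[ ]:

# 3 point field goal percentage = converted / attempted
made_3pt = three_pointer.SHOT_MADE_FLAG[three_pointer.SHOT_MADE_FLAG == 1].count()
total_3pt = three_pointer.SHOT_MADE_FLAG.count()
100.0 * made_3pt / total_3pt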
# At this point, let's do some basic plotting using **matplotlib** [2].
#
# The first plot will be the converted and missed shots using the `DataFrames` we just created, that is, we are going to plot the shot charts.

# In[58]:

sns.set_style("white")
sns.set_color_codes()

#shottrue = shot_df1[shot_df1.SHOT_MADE_FLAG == 1]
#shotfalse = shot_df1[shot_df1.SHOT_MADE_FLAG == 0]

plt.figure(figsize=(12, 11))
plt.scatter(converted.LOC_X, converted.LOC_Y, color='green', label='converted', s=20, marker='o', alpha=0.5)
plt.scatter(missed.LOC_X, missed.LOC_Y, color='red', label='missed', s=20, marker='o', alpha=0.5)
plt.legend()
plt.grid()
plt.xlim(-300, 300)
plt.ylim(-100, 500)
plt.show()


# **FYI, the values in LOC_X and LOC_Y are in inches**

# Let's do a plot of all the 3 point shots.

# In[59]:

sns.set_style("white")
sns.set_color_codes()

plt.figure(figsize=(12, 11))
plt.scatter(three_pointer.LOC_X, three_pointer.LOC_Y, color='green', label='3 point shots', s=20, marker='o', alpha=0.5)
plt.legend()
plt.xlim(-300, 300)
plt.ylim(-100, 500)
plt.show()


# We can also plot all the 3 point shots by converted and missed, as follows,

# In[60]:

sns.set_style("white")
sns.set_color_codes()

plt.figure(figsize=(12, 11))
plt.scatter(three_pointer.LOC_X[three_pointer.SHOT_MADE_FLAG == 1],
            three_pointer.LOC_Y[three_pointer.SHOT_MADE_FLAG == 1],
            color='green', label='3 point shots converted', s=20, marker='o', alpha=0.5)
plt.scatter(three_pointer.LOC_X[three_pointer.SHOT_MADE_FLAG == 0],
            three_pointer.LOC_Y[three_pointer.SHOT_MADE_FLAG == 0],
            color='red', label='3 point shots missed', s=20, marker='o', alpha=0.5)
plt.legend()
plt.xlim(-300, 300)
plt.ylim(-100, 500)
plt.show()


# Finally, we can use **seaborn** [3] to do more advanced plots,

# In[61]:

# create our jointplot (seaborn creates its own figure, so no plt.figure() call is needed)
joint_shot_chart = sns.jointplot(x=shot_df1.LOC_X, y=shot_df1.LOC_Y,
                                 kind='scatter', space=0.2, alpha=1,
                                 height=12, edgecolor='w',
                                 color='g').set_axis_labels("x location", "y location")


# ### In the next tutorial, we are going to work a little bit more on data analytics and advanced plotting.

# # References
#
# [1] http://pandas.pydata.org/
#
# [2] http://matplotlib.org/
#
# [3] http://stanford.edu/~mwaskom/software/seaborn/

# In[62]:

#import sys
#print('Python version:', sys.version_info)
#import IPython
#print('IPython version:', IPython.__version__)
#print('Requests version:', requests.__version__)
#print('Pandas version:', pd.__version__)
#print('json version:', json.__version__)
#import matplotlib
#print('matplotlib version:', matplotlib.__version__)
#print('seaborn version:', sns.__version__)