#!/usr/bin/env python
# coding: utf-8

# # Data scraping, data wrangling, data analytics, exploratory data analysis, advanced plotting and clustering with love.

# In this tutorial, we are going to read the csv file and the json file we saved in the previous tutorial.
#
# Then we are going to use **pandas** [1] to do data wrangling and manipulation of the `DataFrame`.
#
# Finally, we are going to do some basic data analytics and basic plotting using **matplotlib** [2].

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns


# To load the csv file, we proceed as follows,

# In[4]:

#shot_df.to_csv(path_or_buf='test.csv', mode='w')
shot_df = pd.read_csv(filepath_or_buffer='test.csv')
#shot_df = pd.read_csv('test.csv')


# To print the first 4 rows of the `DataFrame`, you can proceed as follows,

# In[5]:

shot_df.head(4)


# Keep in mind that this is a large `DataFrame`; if you do not use **.head(4)**, it will print all the rows in the `DataFrame`.

# To force **pandas** to display all the columns, you can proceed as follows (we only display the first 4 rows),

# In[6]:

pd.set_option('display.max_columns', None)
shot_df.head(4)
#shot_df


# To erase a column in a `DataFrame`, we can proceed as follows.
#
# Notice that we want to erase the column named `Unnamed: 0`, and that `axis=1` tells **drop** to remove a column instead of a row.

# In[7]:

shot_df1 = shot_df.drop('Unnamed: 0', axis=1)


# To display the information in `shot_df1` (we only display the first 2 rows),

# In[8]:

shot_df1.head(2)


# As you can see, we erased the column `Unnamed: 0`.

# Alternatively, we can use column 0 as the row labels of the `DataFrame`.

# In[9]:

shot_df2 = pd.read_csv(filepath_or_buffer='test.csv', index_col=0)


# To display the information in `shot_df2` (we only display the first 2 rows),

# In[10]:

shot_df2.head(2)


# We can do indexing and slicing in a `DataFrame`.
#
# To display all the columns of the first two rows of the `DataFrame` `shot_df2`,

# In[11]:

shot_df2.iloc[0:2, :]


# or we can display columns 10 to 12 of the first five rows,

# In[75]:

shot_df2.iloc[0:5, 10:13]


# To load a json file using the module **json** (the hard way),

# In[12]:

#import json
#import pprint
#from pprint import pprint

#with open('data_json.json') as data_file:
#    data_json = json.load(data_file)

#pprint(data_json)


# In[13]:

#type(data_json)


# In[14]:

#data_json['resultSets'][0]['headers']


# In[15]:

#data_json['resultSets'][0]['rowSet']


# In[16]:

#for x in data_json:
#    print (x)


# In[17]:

#for x in data_json['resultSets']:
#    print (x)


# In[18]:

#data_json
#type(data_json['resultSets'][0])
#print (data_json['resultSets'][0])


# To load the json file using **pandas**, we proceed as follows,

# In[19]:

#pd_json = pd.read_json(path_or_buf='data_json.json')
pd_json = pd.read_json(path_or_buf='data_json.json', typ='series')


# Remember, you can inspect the json file using **JSONView**, as in the previous tutorial.

# To know the type of the json file we just loaded,

# In[20]:

type(pd_json)


# Notice that it is a `series` and not a `DataFrame`. Later on we are going to see how to convert a `series` to a `DataFrame`; for the moment this does not generate any problem.
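# Before printing the whole series, a quick optional check: since we loaded the json with `typ='series'`, the top-level blocks of the file become the index of the series. A minimal sketch, assuming the file contains the `resource`, `parameters` and `resultSets` blocks we use below,

# In[ ]:

# list the top-level blocks of the json file (they form the series index)
list(pd_json.index)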
# The next line will print the content of the json file.
# This is the same information you see when you use **JSONView**, but now we read it directly in Python.

# In[21]:

pd_json


# These lines will print the information inside each block of the json file; they are commented as they print a lot of information.

# In[22]:

#pd_json.parameters


# In[23]:

#pd_json.resource


# In[24]:

#pd_json.resultSets


# At this point, we can create the `DataFrame` using the json file we imported with **pandas** (pretty much as in the previous tutorial).
#
# First we need to grab the headers and the shot chart data.

# In[25]:

# Grab the headers to be used as column headers for our DataFrame
json_headers = pd_json['resultSets'][0]['headers']


# In[26]:

# Grab the shot chart data
json_shots = pd_json['resultSets'][0]['rowSet']


# To display the content of `json_headers`,

# In[27]:

json_headers


# To display the content of `json_shots`. This line is commented as it prints a lot of information.

# In[28]:

#json_shots


# To create the `DataFrame` using `json_headers` and `json_shots`,

# In[29]:

json_to_pd = pd.DataFrame(json_shots, columns=json_headers)


# and to display the first 5 rows of `json_to_pd`,

# In[30]:

json_to_pd.head()


# To determine if `json_to_pd` is equal to `shot_df1` (we only display the first 5 rows),

# In[31]:

json_to_pd.head() == shot_df1.head()


# Notice that the only difference is the column **GAME_ID**. When loading the csv, **pandas** dropped the leading zeroes; the rest of the columns are equal.
#
# This can be fixed, but I will leave it to you as an exercise.

# The `DataFrame` `shot_df1` contains the shot chart data of all the field goal attempts LeBron James took during the 2014-15 regular season.
#
# We are specifically interested in the data saved in the columns `LOC_X`, `LOC_Y` and `SHOT_MADE_FLAG`. The columns `LOC_X` and `LOC_Y` contain the coordinate values of each shot, measured from the basket rim. The column `SHOT_MADE_FLAG` contains the outcome of the shot, 0 for missed and 1 for converted.
#
# To extract from the `DataFrame` `shot_df1` the converted shots, that is **shot_df1.SHOT_MADE_FLAG == 1**, we proceed as follows (we only display the first 2 rows),

# In[32]:

shot_df1[shot_df1.SHOT_MADE_FLAG == 1].head(2)


# To save the previous output in the `DataFrame` `converted`,

# In[33]:

converted = shot_df1[shot_df1.SHOT_MADE_FLAG == 1]


# To get the dimensions of the `DataFrame`,

# In[34]:

converted.shape


# To save the output of the missed shots in the `DataFrame` `missed`,

# In[35]:

missed = shot_df1[shot_df1.SHOT_MADE_FLAG == 0]


# Let's create a new `DataFrame` as follows,

# In[36]:

new_df = pd.DataFrame()


# and let's extract some information from the `DataFrame` `shot_df1` and put it in the `DataFrame` `new_df`.
#
# Notice that we can access the information in a `DataFrame` as
#
# `shot_df1.LOC_X`
#
# or
#
# `shot_df1['LOC_X']`.

# In[37]:

new_df['LOC_X'] = shot_df1.LOC_X
new_df['LOC_Y'] = shot_df1['LOC_Y']
new_df['SHOT_MADE_FLAG'] = shot_df1['SHOT_MADE_FLAG']
new_df['SHOT_TYPE'] = shot_df1['SHOT_TYPE']


# To display the information in the `DataFrame` `new_df` (we only display the first 5 rows),

# In[38]:

#pprint(new_df.head(4))
new_df.head()
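# Before filtering by shot type, it can help to see which labels the `SHOT_TYPE` column contains and how often each one occurs. A minimal sketch using `value_counts` (the two labels, '2PT Field Goal' and '3PT Field Goal', are the ones used in the next cells),

# In[ ]:

# count the number of attempts per shot type
new_df['SHOT_TYPE'].value_counts()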
# We can also extract information from a `DataFrame` using strings.
# From the `DataFrame` `new_df`, let's extract the `SHOT_TYPE` information as follows,

# In[39]:

three_pointer = new_df[new_df.SHOT_TYPE == '3PT Field Goal']
two_pointer = new_df[new_df.SHOT_TYPE == '2PT Field Goal']


# To display the information contained in the `DataFrames` `three_pointer` and `two_pointer` (we only display the first 2 rows),

# In[40]:

three_pointer.head(2)


# In[41]:

two_pointer.head(2)


# To know the type of `two_pointer`,

# In[42]:

type(two_pointer)


# Notice that if we extract the column `two_pointer['LOC_X']` and save it in the object `tmp_object`, as follows,

# In[43]:

tmp_object = two_pointer['LOC_X']


# the type of the object `tmp_object` is a **pandas** `series` instead of a `DataFrame`.

# In[44]:

type(tmp_object)


# To convert `tmp_object` to a `DataFrame`, we proceed as follows.
#
# Notice that the option `name` in `to_frame(name='LOC_X')` corresponds to the name we want to assign to that column in the `DataFrame` `tmp_object1`.

# In[45]:

tmp_object1 = two_pointer['LOC_X'].to_frame(name='LOC_X')


# which has a type

# In[46]:

type(tmp_object1)


# To convert a pandas series or DataFrame to a numpy array, we proceed as follows,

# In[47]:

# .to_numpy() replaces the deprecated .as_matrix() method
np_array0 = shot_df1.to_numpy()
np_array1 = two_pointer['LOC_X'].to_numpy()


# which has a type

# In[48]:

type(np_array0)


# In[49]:

type(np_array1)


# We can also do mathematical operations between columns (and rows) of a `DataFrame`, for example (we only display the first 5 rows),

# In[50]:

(three_pointer['LOC_X']*100).head()
#(three_pointer*0).head()


# In[51]:

((three_pointer['LOC_X']-three_pointer.LOC_Y)/three_pointer.LOC_Y**2).head()


# In[52]:

(three_pointer['LOC_X']/three_pointer['LOC_X'].max()).head()


# To create a new `DataFrame` with the normalized values of `LOC_X` and `LOC_Y`,

# In[53]:

df_norm = pd.DataFrame()
c1 = three_pointer['LOC_X']/three_pointer['LOC_X'].max()
c2 = three_pointer['LOC_Y']/three_pointer['LOC_Y'].max()
df_norm['c1'] = c1
df_norm['c2'] = c2
df_norm.head()


# Notice that we only display the first 5 rows.

# To generate summary statistics of a `DataFrame`,

# In[54]:

shot_df1.describe()


# Notice that it computes statistics only for the columns with numerical values.

# To count the total number of 3 point shots attempted,

# In[55]:

three_pointer.count()


# To count the number of 3 point shots converted, using only the column **SHOT_MADE_FLAG**,

# In[56]:

three_pointer.SHOT_MADE_FLAG[three_pointer.SHOT_MADE_FLAG == 1].count()


# and to count the number of 3 point shots missed,

# In[57]:

three_pointer.SHOT_MADE_FLAG[three_pointer.SHOT_MADE_FLAG == 0].count()
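# With the two counts above, we can also compute the 3 point shooting percentage for the season. A minimal sketch (it simply divides the converted attempts by the total attempts),

# In[ ]:

# 3 point field goal percentage = converted / attempted
made_3pt = three_pointer.SHOT_MADE_FLAG[three_pointer.SHOT_MADE_FLAG == 1].count()
total_3pt = three_pointer.SHOT_MADE_FLAG.count()
100.0 * made_3pt / total_3pt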
# At this point, let's do some basic plotting using **matplotlib** [2].
#
# The first plot will be the converted and missed shots using the `DataFrames` we just created, that is, we are going to plot the shot charts.

# In[58]:

sns.set_style("white")
sns.set_color_codes()

#shottrue = shot_df1[shot_df1.SHOT_MADE_FLAG == 1]
#shotfalse = shot_df1[shot_df1.SHOT_MADE_FLAG == 0]

plt.figure(figsize=(12, 11))
plt.scatter(converted.LOC_X, converted.LOC_Y, color='green', label='converted', s=20, marker='o', alpha=0.5)
plt.scatter(missed.LOC_X, missed.LOC_Y, color='red', label='missed', s=20, marker='o', alpha=0.5)
plt.legend()
plt.grid()
plt.xlim(-300, 300)
plt.ylim(-100, 500)
plt.show()


# **FYI, the values in LOC_X and LOC_Y are in inches**

# Let's do a plot of all the 3 point shots.

# In[59]:

sns.set_style("white")
sns.set_color_codes()

plt.figure(figsize=(12, 11))
plt.scatter(three_pointer.LOC_X, three_pointer.LOC_Y, color='green', label='3 point shots', s=20, marker='o', alpha=0.5)
plt.legend()
plt.xlim(-300, 300)
plt.ylim(-100, 500)
plt.show()


# We can also plot all the 3 point shots by converted and missed, as follows,

# In[60]:

sns.set_style("white")
sns.set_color_codes()

plt.figure(figsize=(12, 11))
plt.scatter(three_pointer.LOC_X[three_pointer.SHOT_MADE_FLAG == 1],
            three_pointer.LOC_Y[three_pointer.SHOT_MADE_FLAG == 1],
            color='green', label='3 point shots converted', s=20, marker='o', alpha=0.5)
plt.scatter(three_pointer.LOC_X[three_pointer.SHOT_MADE_FLAG == 0],
            three_pointer.LOC_Y[three_pointer.SHOT_MADE_FLAG == 0],
            color='red', label='3 point shots missed', s=20, marker='o', alpha=0.5)
plt.legend()
plt.xlim(-300, 300)
plt.ylim(-100, 500)
plt.show()


# Finally, we can use **seaborn** [3] to do more advanced plots,

# In[61]:

# create our jointplot (seaborn creates its own figure, so no plt.figure() call is needed)
joint_shot_chart = sns.jointplot(x=shot_df1.LOC_X, y=shot_df1.LOC_Y,
                                 kind='scatter', space=0.2, alpha=1,
                                 height=12, edgecolor='w',
                                 color='g').set_axis_labels("x location", "y location")


# ### In the next tutorial, we are going to work a little bit more on data analytics and advanced plotting.

# # References
#
# [1] http://pandas.pydata.org/
#
# [2] http://matplotlib.org/
#
# [3] http://stanford.edu/~mwaskom/software/seaborn/

# In[62]:

#import sys
#print('Python version:', sys.version_info)
#import IPython
#print('IPython version:', IPython.__version__)
#print('Requests version:', requests.__version__)
#print('Pandas version:', pd.__version__)
#print('json version:', json.__version__)
#import matplotlib
#print('matplotlib version:', matplotlib.__version__)
#print('seaborn version:', sns.__version__)