import pandas as pd
import matplotlib.pyplot as plt
# Load the player CSV into a DataFrame and preview its first two rows.
player_data = pd.read_csv('/Users/zoeolson1/player_data.csv')
player_data.head(n=2)
/Users/zoeolson1/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment. warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
id | dob | name | nationality | height | weight | wins | fouls | pl_goals | losses | ... | last_man_tackle | clearance_off_line | saves | punches | goal_kicks | penalty_save | keeper_throws | good_high_claim | total_keeper_sweeper | stand_catch_dive_catch | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3452 | 11/26/90 | Danny Welbeck | England | 185cm | 73kg | 87 | 89 | 34 | 35 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 5001 | 12/1/94 | Emre Can | Germany | 184cm | 82kg | 29 | 68 | 2 | 16 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 rows × 56 columns
We are going to look at how players score goals, and whether goals are more frequently scored with the left foot, right foot, or head. First, we will clean our data by replacing all of the NaN values with '0', because this is how they appear in the original CSV. Below we have a list of all of the head, left foot, and right foot goals of each player.
# Narrow the data to the headed / left-foot / right-foot goal columns,
# then replace missing values with zero.
df = pd.DataFrame(
    player_data,
    columns=['att_hd_goal', 'att_lf_goal', 'att_rf_goal'],
)
df2 = df.fillna(value=0)
df2.head(n=2)
att_hd_goal | att_lf_goal | att_rf_goal | |
---|---|---|---|
0 | 9.0 | 7.0 | 18.0 |
1 | 0.0 | 0.0 | 2.0 |
Now, we will format the dataframe to make it suitable for a pie chart. We will do so by aggregating each column to get the total head, right foot, and left foot goals, and then creating labels for each of these categories.
# Give the three goal columns readable labels for the chart.
df2.columns = ['header', 'left-foot', 'right-foot']
# Append the column sums as a 'Total' row.  An existing 'Total' row is left
# out of the sum so that re-running this cell does not add the previous
# total on top of the per-player values.
df2.loc['Total'] = df2.drop('Total', errors='ignore').sum()
df2
header | left-foot | right-foot | |
---|---|---|---|
0 | 9.0 | 7.0 | 18.0 |
1 | 0.0 | 0.0 | 2.0 |
2 | 3.0 | 6.0 | 1.0 |
3 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 |
5 | 0.0 | 0.0 | 0.0 |
6 | 1.0 | 2.0 | 5.0 |
7 | 0.0 | 0.0 | 0.0 |
8 | 0.0 | 0.0 | 0.0 |
9 | 1.0 | 5.0 | 1.0 |
10 | 0.0 | 0.0 | 0.0 |
11 | 0.0 | 0.0 | 0.0 |
12 | 0.0 | 0.0 | 0.0 |
13 | 0.0 | 0.0 | 0.0 |
14 | 0.0 | 0.0 | 0.0 |
15 | 3.0 | 0.0 | 1.0 |
16 | 1.0 | 6.0 | 10.0 |
17 | 3.0 | 2.0 | 13.0 |
18 | 7.0 | 2.0 | 13.0 |
19 | 0.0 | 19.0 | 5.0 |
20 | 4.0 | 4.0 | 20.0 |
21 | 0.0 | 1.0 | 0.0 |
22 | 1.0 | 0.0 | 0.0 |
23 | 0.0 | 0.0 | 0.0 |
24 | 1.0 | 0.0 | 3.0 |
25 | 1.0 | 0.0 | 6.0 |
26 | 0.0 | 0.0 | 0.0 |
27 | 2.0 | 3.0 | 1.0 |
28 | 0.0 | 0.0 | 0.0 |
29 | 0.0 | 1.0 | 0.0 |
... | ... | ... | ... |
538 | 0.0 | 0.0 | 0.0 |
539 | 3.0 | 0.0 | 13.0 |
540 | 0.0 | 0.0 | 0.0 |
541 | 1.0 | 18.0 | 81.0 |
542 | 0.0 | 0.0 | 0.0 |
543 | 2.0 | 12.0 | 3.0 |
544 | 0.0 | 0.0 | 0.0 |
545 | 0.0 | 0.0 | 0.0 |
546 | 0.0 | 2.0 | 14.0 |
547 | 0.0 | 2.0 | 2.0 |
548 | 0.0 | 0.0 | 0.0 |
549 | 0.0 | 0.0 | 0.0 |
550 | 0.0 | 0.0 | 0.0 |
551 | 0.0 | 0.0 | 0.0 |
552 | 1.0 | 0.0 | 1.0 |
553 | 0.0 | 3.0 | 1.0 |
554 | 0.0 | 0.0 | 1.0 |
555 | 0.0 | 0.0 | 0.0 |
556 | 0.0 | 0.0 | 0.0 |
557 | 0.0 | 0.0 | 0.0 |
558 | 0.0 | 0.0 | 1.0 |
559 | 0.0 | 0.0 | 0.0 |
560 | 2.0 | 2.0 | 17.0 |
561 | 0.0 | 0.0 | 0.0 |
562 | 3.0 | 1.0 | 1.0 |
563 | 0.0 | 0.0 | 0.0 |
564 | 0.0 | 1.0 | 1.0 |
565 | 3.0 | 5.0 | 5.0 |
566 | 0.0 | 0.0 | 0.0 |
Total | 655.0 | 1048.0 | 2067.0 |
568 rows × 3 columns
Great, we have a total row. Now let's remove all of the data except for the column headers and totals.
# Transpose so each goal type becomes a row and 'Total' becomes a column.
goal_types = df2.transpose()
goal_types.head(n=3)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
header | 9.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 2.0 | 0.0 | 3.0 | 0.0 | 0.0 | 3.0 | 0.0 | 655.0 |
left-foot | 7.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 5.0 | ... | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 5.0 | 0.0 | 1048.0 |
right-foot | 18.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 17.0 | 0.0 | 1.0 | 0.0 | 1.0 | 5.0 | 0.0 | 2067.0 |
3 rows × 568 columns
# Reduce the frame to its 'Total' column, kept as a one-column DataFrame.
goal_types = goal_types['Total'].to_frame()
goal_types.head(n=3)
Total | |
---|---|
header | 655.0 |
left-foot | 1048.0 |
right-foot | 2067.0 |
Great, now our data frame is ready for a pie chart display!
For our pie chart, we will choose colors using iWantHue ( http://tools.medialab.sciences-po.fr/iwanthue/ ).
# Palette chosen with iWantHue.
colors = ["#e49300", "#0263f3", "#dbff62", "#ff004e","#a14400"]
# One wedge per row of goal_types, labelled by the row index and annotated
# with its percentage share.
plt.pie(
    goal_types['Total'],
    explode=(0, 0, 0),
    labels=goal_types.index,
    colors=colors,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90,
)
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.tight_layout()
plt.show()
Yay! We have our pie chart. Now let's look at what other factors go into goal scoring.
# Select the columns used for the per-nation analysis.
ng = pd.DataFrame(
    player_data,
    columns=['nationality', 'id', 'pl_goals', 'appearances', 'blocked_scoring_att'],
)
# Fill missing values column by column.  A blanket fillna(0) would turn a
# missing nationality into the integer 0, which then shows up as a bogus
# nationality "0" group; label it 'Unknown' instead and zero-fill only the
# numeric columns.
nat_goals = ng.fillna({
    'nationality': 'Unknown',
    'id': 0,
    'pl_goals': 0,
    'appearances': 0,
    'blocked_scoring_att': 0,
})
nat_goals.head(3)
nationality | id | pl_goals | appearances | blocked_scoring_att | |
---|---|---|---|---|---|
0 | England | 3452 | 34 | 154 | 84.0 |
1 | Germany | 5001 | 2 | 60 | 13.0 |
2 | Spain | 4805 | 10 | 39 | 13.0 |
# Count the ids in each nationality group (one row per nationality).
nat_goals2 = nat_goals.groupby(['nationality'], as_index=False)['id'].count()
nat_goals2.head()
nationality | id | |
---|---|---|
0 | 0 | 1 |
1 | Algeria | 4 |
2 | Argentina | 16 |
3 | Armenia | 1 |
4 | Australia | 5 |
# Sum pl_goals within each nationality group (one row per nationality).
nat_goals3 = nat_goals.groupby(['nationality'], as_index=False)['pl_goals'].sum()
nat_goals3.head()
nationality | pl_goals | |
---|---|---|
0 | 0 | 0 |
1 | Algeria | 25 |
2 | Argentina | 152 |
3 | Armenia | 0 |
4 | Australia | 0 |
# Join the per-nation player counts with the per-nation goal totals on the
# shared 'nationality' column.
merged_goals = nat_goals2.merge(nat_goals3, on='nationality')
merged_goals
nationality | id | pl_goals | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | Algeria | 4 | 25 |
2 | Argentina | 16 | 152 |
3 | Armenia | 1 | 0 |
4 | Australia | 5 | 0 |
5 | Austria | 7 | 23 |
6 | Belgium | 23 | 289 |
7 | Bosnia And Herzegovina | 2 | 0 |
8 | Brazil | 12 | 96 |
9 | Cameroon | 2 | 0 |
10 | Canada | 1 | 2 |
11 | Chile | 2 | 33 |
12 | Colombia | 3 | 1 |
13 | Congo | 3 | 20 |
14 | Costa Rica | 1 | 2 |
15 | Cote D'Ivoire | 7 | 107 |
16 | Croatia | 1 | 3 |
17 | Curacao | 1 | 1 |
18 | Czech Republic | 1 | 0 |
19 | Denmark | 5 | 23 |
20 | Ecuador | 4 | 29 |
21 | Egypt | 3 | 5 |
22 | England | 207 | 1585 |
23 | Equatorial Guinea | 1 | 0 |
24 | Estonia | 1 | 0 |
25 | France | 30 | 208 |
26 | Gabon | 1 | 0 |
27 | Gambia | 1 | 1 |
28 | Germany | 12 | 44 |
29 | Ghana | 3 | 16 |
... | ... | ... | ... |
33 | Italy | 6 | 17 |
34 | Jamaica | 2 | 7 |
35 | Japan | 3 | 7 |
36 | Kenya | 1 | 5 |
37 | Lithuania | 1 | 0 |
38 | Mali | 1 | 2 |
39 | Morocco | 2 | 0 |
40 | Netherlands | 18 | 45 |
41 | New Zealand | 1 | 4 |
42 | Nigeria | 9 | 66 |
43 | Northern Ireland | 7 | 47 |
44 | Norway | 4 | 9 |
45 | Poland | 6 | 1 |
46 | Portugal | 5 | 7 |
47 | Romania | 2 | 0 |
48 | Scotland | 17 | 133 |
49 | Senegal | 7 | 68 |
50 | Serbia | 5 | 47 |
51 | Slovakia | 1 | 0 |
52 | South Africa | 1 | 19 |
53 | South Korea | 3 | 28 |
54 | Spain | 41 | 243 |
55 | Sweden | 5 | 38 |
56 | Switzerland | 6 | 9 |
57 | Tunisia | 2 | 2 |
58 | Turkey | 1 | 0 |
59 | United States | 6 | 2 |
60 | Uruguay | 5 | 15 |
61 | Venezuela | 1 | 11 |
62 | Wales | 13 | 70 |
63 rows × 3 columns
Excellent! Now let's look at nations with at least 10 players.
# Keep only the nations that have at least 10 players.
final_data = merged_goals.loc[merged_goals['id'] >= 10]
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# Plot the filtered frame (the original plotted the unfiltered merged_goals,
# ignoring the filter computed above) and label each bar group with its
# nationality rather than the numeric row index.
final_data.plot.bar(x='nationality')
<matplotlib.axes._subplots.AxesSubplot at 0x11ab1af10>
import os

import plotly.graph_objs as go
import plotly.plotly as py

# Credentials are read from the environment so that the API key is never
# committed with the notebook.  Set PLOTLY_USERNAME and PLOTLY_API_KEY before
# running; a missing variable raises KeyError naming it.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])

# One bar series for the number of players per nation ...
trace1 = go.Bar(
    x=final_data['nationality'],
    y=final_data['id'],
    name='Players',
    marker=dict(
        color='rgb(55, 83, 109)'
    )
)
# ... and one for the goals scored by those players.
trace2 = go.Bar(
    x=final_data['nationality'],
    y=final_data['pl_goals'],
    name='Goals',
    marker=dict(
        color='rgb(26, 118, 255)'
    )
)
data = [trace1, trace2]
layout = go.Layout(
    title='# of Players vs. Goals Made',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Count',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    # Transparent legend box anchored at the top-left of the plot area.
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    # Draw the two series side by side for each nation.
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')
High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~zoe1114/0 or inside your plot.ly account where it is named 'style-bar'
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import pandas as pd

nd = pd.read_csv('/Users/zoeolson1/player_data3.csv')
# One string per player with a known nationality.  The original called
# str() on the whole column, which yields the truncated printed form of the
# Series (index numbers, "...", "Name: nationality, dtype: object") rather
# than the nationality values themselves.
nationalities = nd['nationality'].dropna().astype(str)
words = ' '.join(nationalities)
# Report the number of players directly instead of counting commas in the
# joined text.  This print form works on both Python 2 and Python 3.
print("amount of players for analysis: %d" % len(nationalities))
# The font path is absolute; without the leading '/' it only resolved when
# the working directory happened to be the filesystem root.
cloud = WordCloud(font_path='/System/Library/Fonts/Noteworthy.ttc',
                  stopwords=STOPWORDS,
                  background_color='white',
                  width=500, height=500).generate(words)
plt.imshow(cloud)
plt.axis("off")
plt.show()
plt.close()
amount of players for analysis: 536
import os

import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

# Credentials are read from the environment so that the API key is never
# committed with the notebook.  Set PLOTLY_USERNAME and PLOTLY_API_KEY before
# running; a missing variable raises KeyError naming it.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])

# The dates are parsed explicitly below; parse_dates=True only affects the
# index and had no effect here, so it is omitted.
nd = pd.read_csv('/Users/zoeolson1/player_data3.csv')
nd['dob'] = pd.to_datetime(nd['dob'])
nd['year2'] = nd['dob'].dt.year

layout = go.Layout(title='Year Born vs. Number of Appearances')

# One point per player.  Markers only: the rows are not ordered by year, so
# joining consecutive rows with lines would draw meaningless connections.
trace0 = go.Scatter(
    x=nd['year2'],
    y=nd['appearances'],
    mode='markers',
    name='appearances'
)
data = [trace0]

# Plot and embed in ipython notebook!
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='line-mode')
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd

df = pd.read_csv('/Users/zoeolson1/player_data3.csv')
# One row per (nationality, Latitude, Longitude) with the number of players.
dt = pd.DataFrame({'count' : df.groupby(['nationality', 'Latitude', 'Longitude'])['id'].count()}).reset_index()
dt['text'] = dt['nationality'] + '<br>Count ' + (dt['count']).astype(str)

# Inclusive player-count ranges, one map trace per range.  The original used
# these pairs as positional row slices, which dropped most nations and did
# not match the legend labels.  The final open-ended range (upper bound None)
# keeps nations with more than 30 players on the map.
limits = [(0,1),(2,6),(7,12),(13,17),(18,24),(25,30),(31,None)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","rgb(0,255, 255)", "rgb(255,255,51)", "rgb(46,204,64)"]
cities = []
for (low, high), color in zip(limits, colors):
    in_bucket = dt['count'] >= low
    if high is not None:
        in_bucket = in_bucket & (dt['count'] <= high)
    df_sub = dt[in_bucket]
    if high is None:
        label = '{0}+'.format(low)
    else:
        label = '{0} - {1}'.format(low, high)
    city = dict(
        type = 'scattergeo',
        # The original fed Latitude into lon and Longitude into lat.
        # NOTE(review): assumes the CSV columns are labelled correctly -
        # confirm against the data.
        lon = df_sub['Longitude'],
        lat = df_sub['Latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['count']*20,
            color = color,
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = label )
    cities.append(city)

layout = dict(
    title = '2014 English Premier League Player Origins<br>(Click legend to toggle traces)',
    showlegend = True,
    geo = dict(
        # Attribute values are lowercase in plotly; 'Europe' and 'Mercator'
        # were not valid values and went unnoticed because validation is
        # off.  'world' is used so that non-European nations in the data
        # are visible.
        scope='world',
        projection=dict( type='mercator' ),
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)"
    ),
)
fig = dict( data=cities, layout=layout )
py.iplot( fig, validate=False, filename='d3-bubble-map-populations' )
Please view using NBVIEWER through this link: http://nbviewer.jupyter.org/github/kylemh/FPL-DataVisualization/blob/master/jupyter_viz.ipynb