In [1]:

import pandas as pd
import matplotlib.pyplot as plt
player_data = pd.read_csv('/Users/zoeolson1/player_data.csv')
player_data.head(2)

/Users/zoeolson1/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Out[1]:

	id	dob	name	nationality	height	weight	wins	fouls	pl_goals	losses	...	last_man_tackle	clearance_off_line	saves	punches	goal_kicks	penalty_save	keeper_throws	good_high_claim	total_keeper_sweeper	stand_catch_dive_catch
0	3452	11/26/90	Danny Welbeck	England	185cm	73kg	87	89	34	35	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	5001	12/1/94	Emre Can	Germany	184cm	82kg	29	68	2	16	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

2 rows × 56 columns

We are going to look at how players score goals, and weather goals are more frequently made with the left foot, right foot, or head. First, we will clean our data by replacing all of the NaN values with '0', because this is how they appear in the original CSV. Below we have a list of all of the head, left foot, and right foot goals of each player.

In [2]:

df = pd.DataFrame(player_data, columns = ['att_hd_goal', 'att_lf_goal', 'att_rf_goal'])
df2 = df.fillna(0)
df2.head(2)

Out[2]:

	att_hd_goal	att_lf_goal	att_rf_goal
0	9.0	7.0	18.0
1	0.0	0.0	2.0

Now, we will format the dataframe to make it suitable for a pie chart. We will do so by aggregating each column to get the total head, right foot, and left foot goals. Then creating leables for each of these categories.

In [3]:

df2.columns = ['header', 'left-foot', 'right-foot']
df2.loc['Total']= df2.sum()
df2

Out[3]:

	header	left-foot	right-foot
0	9.0	7.0	18.0
1	0.0	0.0	2.0
2	3.0	6.0	1.0
3	0.0	0.0	0.0
4	0.0	0.0	0.0
5	0.0	0.0	0.0
6	1.0	2.0	5.0
7	0.0	0.0	0.0
8	0.0	0.0	0.0
9	1.0	5.0	1.0
10	0.0	0.0	0.0
11	0.0	0.0	0.0
12	0.0	0.0	0.0
13	0.0	0.0	0.0
14	0.0	0.0	0.0
15	3.0	0.0	1.0
16	1.0	6.0	10.0
17	3.0	2.0	13.0
18	7.0	2.0	13.0
19	0.0	19.0	5.0
20	4.0	4.0	20.0
21	0.0	1.0	0.0
22	1.0	0.0	0.0
23	0.0	0.0	0.0
24	1.0	0.0	3.0
25	1.0	0.0	6.0
26	0.0	0.0	0.0
27	2.0	3.0	1.0
28	0.0	0.0	0.0
29	0.0	1.0	0.0
...	...	...	...
538	0.0	0.0	0.0
539	3.0	0.0	13.0
540	0.0	0.0	0.0
541	1.0	18.0	81.0
542	0.0	0.0	0.0
543	2.0	12.0	3.0
544	0.0	0.0	0.0
545	0.0	0.0	0.0
546	0.0	2.0	14.0
547	0.0	2.0	2.0
548	0.0	0.0	0.0
549	0.0	0.0	0.0
550	0.0	0.0	0.0
551	0.0	0.0	0.0
552	1.0	0.0	1.0
553	0.0	3.0	1.0
554	0.0	0.0	1.0
555	0.0	0.0	0.0
556	0.0	0.0	0.0
557	0.0	0.0	0.0
558	0.0	0.0	1.0
559	0.0	0.0	0.0
560	2.0	2.0	17.0
561	0.0	0.0	0.0
562	3.0	1.0	1.0
563	0.0	0.0	0.0
564	0.0	1.0	1.0
565	3.0	5.0	5.0
566	0.0	0.0	0.0
Total	655.0	1048.0	2067.0

568 rows × 3 columns

Great, we have a total row. Now lets remove all of the data except for the column headers and totals.

In [4]:

goal_types = df2.T

In [5]:

goal_types.head(3)

Out[5]:

	0	1	2	6	9	...	558	560	562	564	565	Total
header	9.0	0.0	3.0	1.0	1.0	...	0.0	2.0	3.0	0.0	3.0	655.0
left-foot	7.0	0.0	6.0	2.0	5.0	...	0.0	2.0	1.0	1.0	5.0	1048.0
right-foot	18.0	2.0	1.0	5.0	1.0	...	1.0	17.0	1.0	1.0	5.0	2067.0

3 rows × 568 columns

In [6]:

goal_types = pd.DataFrame(goal_types['Total'])

goal_types.head(3)

Out[6]:

	Total
header	655.0
left-foot	1048.0
right-foot	2067.0

Great, now our data frame is ready for a pie chart display!
For our pie chart, we will choose colors using iWantHue ( http://tools.medialab.sciences-po.fr/iwanthue/ ).

In [7]:

colors = ["#e49300", "#0263f3", "#dbff62", "#ff004e","#a14400"]

plt.pie(goal_types['Total'], labels = goal_types.index, shadow= False, colors = colors, explode = (0, 0, 0), startangle = 90, autopct='%1.1f%%',)

plt.axis('equal')

plt.tight_layout()
plt.show()

Yay! We have our pie chart. Now lets look at what other factors go into goal scoring..

In [8]:

ng = pd.DataFrame(player_data, columns = ['nationality', 'id', 'pl_goals', 'appearances', 'blocked_scoring_att'])
nat_goals = ng.fillna(0)
nat_goals.head(3)

Out[8]:

	nationality	id	pl_goals	appearances	blocked_scoring_att
0	England	3452	34	154	84.0
1	Germany	5001	2	60	13.0
2	Spain	4805	10	39	13.0

In [20]:

nat_goals2 = (nat_goals.groupby(['nationality'],as_index=False).id.count())
nat_goals2.head()

Out[20]:

	nationality	id
0	0	1
1	Algeria	4
2	Argentina	16
3	Armenia	1
4	Australia	5

In [21]:

nat_goals3 = (nat_goals.groupby(['nationality'],as_index=False).pl_goals.sum())
nat_goals3.head()

Out[21]:

	nationality	pl_goals
0	0	0
1	Algeria	25
2	Argentina	152
3	Armenia	0
4	Australia	0

In [24]:

merged_goals = pd.merge(left=nat_goals2,right=nat_goals3, left_on='nationality', right_on='nationality')
merged_goals

Out[24]:

	nationality	id	pl_goals
0	0	1	0
1	Algeria	4	25
2	Argentina	16	152
3	Armenia	1	0
4	Australia	5	0
5	Austria	7	23
6	Belgium	23	289
7	Bosnia And Herzegovina	2	0
8	Brazil	12	96
9	Cameroon	2	0
10	Canada	1	2
11	Chile	2	33
12	Colombia	3	1
13	Congo	3	20
14	Costa Rica	1	2
15	Cote D'Ivoire	7	107
16	Croatia	1	3
17	Curacao	1	1
18	Czech Republic	1	0
19	Denmark	5	23
20	Ecuador	4	29
21	Egypt	3	5
22	England	207	1585
23	Equatorial Guinea	1	0
24	Estonia	1	0
25	France	30	208
26	Gabon	1	0
27	Gambia	1	1
28	Germany	12	44
29	Ghana	3	16
...	...	...	...
33	Italy	6	17
34	Jamaica	2	7
35	Japan	3	7
36	Kenya	1	5
37	Lithuania	1	0
38	Mali	1	2
39	Morocco	2	0
40	Netherlands	18	45
41	New Zealand	1	4
42	Nigeria	9	66
43	Northern Ireland	7	47
44	Norway	4	9
45	Poland	6	1
46	Portugal	5	7
47	Romania	2	0
48	Scotland	17	133
49	Senegal	7	68
50	Serbia	5	47
51	Slovakia	1	0
52	South Africa	1	19
53	South Korea	3	28
54	Spain	41	243
55	Sweden	5	38
56	Switzerland	6	9
57	Tunisia	2	2
58	Turkey	1	0
59	United States	6	2
60	Uruguay	5	15
61	Venezuela	1	11
62	Wales	13	70

63 rows × 3 columns

Excellent! Now lets look at nations with over 10 players.

In [35]:

final_data = merged_goals.loc[merged_goals['id'] >= 10]

In [36]:

import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
merged_goals.plot.bar()

Out[36]:

<matplotlib.axes._subplots.AxesSubplot at 0x11ab1af10>

In [40]:

import plotly.graph_objs as go
import plotly.plotly as py

py.sign_in('zoe1114', 'gqr5grvyef')

trace1 = go.Bar(
    x=final_data['nationality'],
    y=final_data['id'],
    name='Players',
    marker=dict(
        color='rgb(55, 83, 109)'
    )
)
trace2 = go.Bar(
    x=final_data['nationality'],
    y=final_data['pl_goals'],
    name='Goals',
    marker=dict(
        color='rgb(26, 118, 255)'
    )
)
data = [trace1, trace2]
layout = go.Layout(
    title='# of Players vs. Goals Made',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Count',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~zoe1114/0 or inside your plot.ly account where it is named 'style-bar'

Out[40]:

In [13]:

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import pandas as pd

nd = pd.read_csv('/Users/zoeolson1/player_data3.csv')
nd.head(2)
nd['nationality'] = str(nd['nationality'])


words =' '.join(nd['nationality'])
print "amount of players for analysis: ", (len(words.split(",")))

cloud = WordCloud(font_path='System/Library/Fonts/Noteworthy.ttc', stopwords=STOPWORDS,
                      background_color='white',
                      width=500, height=500).generate(words)

plt.imshow(cloud)
plt.axis("off")
plt.show()
plt.close()

amount of players for analysis:  536

In [27]:

import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

py.sign_in('zoe1114', 'gqr5grvyef')

nd = pd.read_csv('/Users/zoeolson1/player_data3.csv', parse_dates = True)
nd['dob'] = pd.to_datetime(nd['dob'])
nd['year2'] = nd['dob'].dt.year

layout = go.Layout(title='Year Born vs. Number of Appearances')

# Create traces
trace0 = go.Scatter(
    x = nd['year2'],
    y = nd['appearances'],
    mode = 'lines+markers',
    name = 'lines+markers'
)
#trace1 = go.Scatter(
 #   x = nd['dob'],
#    y = nd['wins'],
   # mode = 'lines+markers',
#    name = 'lines+markers'
#)
#trace2 = go.Scatter(
#    x = random_x,
#    y = random_y2,
#    mode = 'markers',
#    name = 'markers'
#)
data = [trace0]

# Plot and embed in ipython notebook!
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='line-mode')

Out[27]:

In [87]:

import plotly.plotly as py
import plotly.graph_objs as go

import pandas as pd


df = pd.read_csv('/Users/zoeolson1/player_data3.csv')

dt = pd.DataFrame({'count' : df.groupby(['nationality', 'Latitude', 'Longitude'])['id'].count()}).reset_index()

dt['text'] = dt['nationality'] + '<br>Count ' + (dt['count']).astype(str)
limits = [(0,1),(2,6),(7,12),(13,17),(18,24),(25,30)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","rgb(0,255, 255)", "rgb(255,255,51)"]
cities = []
                                    
                                      
for i in range(len(limits)):
    lim = limits[i]
    df_sub = dt[lim[0]:lim[1]]
    city = dict(
        type = 'scattergeo',
        locationmode = 'Africa',
        lon = df_sub['Latitude'],
        lat = df_sub['Longitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['count']*20,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    cities.append(city)
                                      
layout = dict(
        title = '2014 English Primer League Player Orgins<br>(Click legend to toggle traces)',
        showlegend = True,
        geo = dict(
            scope='Europe',
            projection=dict( type='Mercator' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=cities, layout=layout )
py.iplot( fig, validate=False, filename='d3-bubble-map-populations' )

Out[87]:

Please view using NBVIEWER through this link: http://nbviewer.jupyter.org/github/kylemh/FPL-DataVisualization/blob/master/jupyter_viz.ipynb

In [ ]: