import pandas as pd, numpy as np, plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
py.offline.init_notebook_mode(connected=True)
from plotly import tools
from plotly.graph_objs import *
%matplotlib inline
from math import floor
import networkx as nx
This notebook will focus on the following types of plots using Plotly:
# Load the cleaned movie table and add a "profit" column
# (revenue minus budget) in a single chained expression.
data = pd.read_csv("data/cleaned_movie.csv").assign(
    profit=lambda frame: frame["revenue"] - frame["budget"]
)
def extract_decade(x):
    """Return the decade label for a year, e.g. ``1995`` -> ``"1990s"``.

    Args:
        x: A numeric year (int or float).

    Returns:
        The year rounded down to a multiple of ten, followed by ``"s"``.
    """
    return str(floor(x / 10) * 10) + "s"
# Label every movie with its release decade, then preview the first three
# records transposed (one column per movie) so the wide table stays readable.
data["decade"] = data["year"].map(extract_decade)
data.head(3).T
0 | 1 | 2 | |
---|---|---|---|
budget | 237000000 | 300000000 | 245000000 |
genres | ['Action', 'Adventure', 'Fantasy', 'Science Fi... | ['Adventure', 'Fantasy', 'Action'] | ['Action', 'Adventure', 'Crime'] |
keywords | ['culture clash', 'future', 'space war', 'spac... | ['ocean', 'drug abuse', 'exotic island', 'east... | ['spy', 'based on novel', 'secret agent', 'seq... |
original_language | en | en | en |
overview | In the 22nd century, a paraplegic Marine is di... | Captain Barbossa, long believed to be dead, ha... | A cryptic message from Bond’s past sends him o... |
popularity | 150.438 | 139.083 | 107.377 |
production_companies | ['Ingenious Film Partners', 'Twentieth Century... | ['Walt Disney Pictures', 'Jerry Bruckheimer Fi... | ['Columbia Pictures', 'Danjaq', 'B24'] |
production_countries | ['United States of America', 'United Kingdom'] | ['United States of America'] | ['United Kingdom', 'United States of America'] |
release_date | 2009-12-10 | 2007-05-19 | 2015-10-26 |
revenue | 2787965087 | 961000000 | 880674609 |
runtime | 162 | 169 | 148 |
spoken_languages | ['English', 'Español'] | ['English'] | ['Français', 'English', 'Español', 'Italiano',... |
vote_average | 7.2 | 6.9 | 6.3 |
vote_count | 11800 | 4500 | 4466 |
title | Avatar | Pirates of the Caribbean: At World's End | Spectre |
cast | [{'cast_id': 242, 'character': 'Jake Sully', '... | [{'cast_id': 4, 'character': 'Captain Jack Spa... | [{'cast_id': 1, 'character': 'James Bond', 'cr... |
crew | [{'credit_id': '52fe48009251416c750aca23', 'de... | [{'credit_id': '52fe4232c3a36847f800b579', 'de... | [{'credit_id': '54805967c3a36829b5002c41', 'de... |
actor1 | Sam Worthington | Johnny Depp | Daniel Craig |
actor2 | Zoe Saldana | Orlando Bloom | Christoph Waltz |
actor3 | Sigourney Weaver | Keira Knightley | Léa Seydoux |
director | James Cameron | Gore Verbinski | Sam Mendes |
year | 2009 | 2007 | 2015 |
month | 12 | 5 | 10 |
day | 10 | 19 | 26 |
dow | 3 | 5 | 0 |
profit | 2550965087 | 661000000 | 635674609 |
decade | 2000s | 2000s | 2010s |
# Sum the vote counts per decade, then discard the first four rows of the
# grouped result before plotting.
votes_per_decade = data.groupby("decade")["vote_count"].sum().reset_index()
df_by_vote = votes_per_decade.iloc[4:]

bar_data = [go.Bar(x=df_by_vote["decade"], y=df_by_vote["vote_count"])]

# Bar chart: one bar per remaining decade.
py.offline.iplot(dict(
    data=bar_data,
    layout=dict(
        title='Vote Count for each decade',
        xaxis=dict(title='Decade'),
        yaxis=dict(title='Total Votes'),
    ),
))
# One violin trace per decade, showing the distribution of average ratings.
# The distinct decades are computed once (in order of first appearance)
# instead of being recomputed on every use inside the loop.
decades = pd.unique(data['decade'])

da = []
for decade in decades:
    in_decade = data['decade'] == decade  # row mask shared by x and y
    da.append({
        "type": 'violin',
        "x": data['decade'][in_decade],
        "y": data['vote_average'][in_decade],
        "name": decade,
        "box": {"visible": True},       # draw the inner box plot
        "meanline": {"visible": True},  # draw the mean line
    })

fig = {
    "data": da,
    "layout": {
        "title": "Average Movie Ratings by Decade",
        # NOTE(review): `autotick` is a legacy axis attribute; it only gets
        # through because validation is disabled below - confirm the installed
        # plotly version still honours it.
        "xaxis": dict(title='Decade', autotick=False, showticklabels=True),
        "yaxis": dict(title='Average Rating'),
    },
}
iplot(fig, validate=False)
# Bubble scatter of profit against budget; marker size and colour both encode
# the movie's average rating, and hovering shows the title.
layout = go.Layout(
    title='Profit vs. Budget',
    xaxis=dict(
        title='Budget',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 4e8],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)

# Keep only movies with a budget above 5000; filter once and reuse the
# result for every trace attribute.
budgeted = data[data.budget > 5000]

data2 = [go.Scatter(
    x=budgeted.budget.values,   # Budget
    y=budgeted.profit.values,   # Profit (revenue - budget)
    mode='markers',
    text=budgeted.title.values,  # Movie titles
    marker=dict(
        size=3 * budgeted.vote_average,
        sizeref=1.0,
        color=budgeted.vote_average.values,
        # A plain dict is used instead of the deprecated `ColorBar` class.
        colorbar=dict(title='Average Rating<br> ', tickvals=[0, 1.5, 3, 5, 7, 8.5]),
        showscale=True,
        colorscale='Viridis'
    ))]

fig = go.Figure(data=data2, layout=layout)
iplot(fig)
# Count, for every first-billed actor, the rows that have a non-null year.
df_appearance = data[['actor1', 'year']].groupby('actor1').count().reset_index()

# Actors who appear as first-billed more than three times.
most_prolific = df_appearance.loc[df_appearance['year'] > 3, 'actor1'].tolist()

# Movies whose three billed actors are all in the prolific list.
all_three_prolific = (
    data.actor1.isin(most_prolific)
    & data.actor2.isin(most_prolific)
    & data.actor3.isin(most_prolific)
)
subset1 = data[all_three_prolific].reset_index(drop=True)

# Every such movie contributes its three co-star pairs, in the order
# (1, 2), (1, 3), (2, 3).
pair = []
for first, second, third in zip(subset1["actor1"], subset1["actor2"], subset1["actor3"]):
    pair.extend([(first, second), (first, third), (second, third)])
# Undirected co-star graph: actors are nodes, each collected pair is an edge.
G = nx.Graph()
G.add_edges_from(pair)

# `G.nodes` is used because the `G.node` alias was removed in networkx 2.4.
nodes = list(G.nodes)

# Map each actor to their degree in the graph.
d = dict(nx.degree(G))

# The 15 highest-degree actors, in ascending order of degree. Slicing from
# the end keeps this correct whatever the total number of nodes is.
imp_actors = sorted(d, key=d.get)[-15:]

# 2-D coordinates for every node, split into x and y lists in `nodes` order.
pos = nx.kamada_kawai_layout(G)
Xv = [pos[k][0] for k in nodes]
Yv = [pos[k][1] for k in nodes]
# Edge coordinates: each edge contributes its two endpoints followed by None,
# so that consecutive edges are drawn as separate line segments.
Xed = []
Yed = []
for edge in pair:
    Xed += [pos[edge[0]][0], pos[edge[1]][0], None]
    Yed += [pos[edge[0]][1], pos[edge[1]][1], None]

# Plain dicts and `go.*` classes replace the deprecated star-imported wrappers
# (Line, Marker, ColorBar, Font, XAxis, YAxis, Margin, Data).

# Light grey edges with hover disabled.
trace3 = go.Scatter(
    x=Xed,
    y=Yed,
    mode='lines',
    line=dict(color='rgb(210,210,210)', width=1),
    hoverinfo='none'
)

# One marker per actor, coloured by number of connections (graph degree).
trace4 = go.Scatter(
    x=Xv,
    y=Yv,
    mode='markers',
    name='net',
    marker=dict(
        symbol='circle',  # 'dot' is not a valid plotly marker symbol
        size=10,
        showscale=True,
        colorscale='Viridis',
        reversescale=False,
        colorbar=dict(title='Number of Connections<br> ',
                      tickvals=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]),
        # Look degrees up by node so colours always line up with Xv/Yv.
        color=[d[k] for k in nodes],
        line=dict(color='rgb(50,50,50)', width=0.5)
    ),
    text=nodes,
    hoverinfo='text'
)

# Axes carry no meaning in a network layout, so grid, zero line and tick
# labels are all hidden.
layout = go.Layout(
    title="Leading Actors and their Connections",
    font=dict(size=12),
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    margin=dict(
        l=40,
        r=40,
        b=85,
        t=100,
    ),
)

data1 = [trace3, trace4]
fig1 = go.Figure(data=data1, layout=layout)
iplot(fig1)
# The 15 highest-degree actors, in ascending order of degree. Slicing from
# the end keeps this at 15 regardless of how many nodes the graph has.
imp_actors = sorted(d, key=d.get)[-15:]

# For each of those actors, average the profit, rating and popularity over
# the movies they appear in, and record their degree.
features = []
for item in imp_actors:
    # NOTE(review): only actor1/actor2 are matched here, while the co-star
    # pairs are built from actor1, actor2 and actor3 - confirm this is intended.
    group = data.loc[(data['actor1'] == item) | (data['actor2'] == item)]
    features.append((
        item,
        group.profit.mean(),
        group.vote_average.mean(),
        group.popularity.mean(),
        d[item],
    ))

# The last column name (including its trailing ")") is looked up verbatim
# further down, so it is kept exactly as is.
top15 = pd.DataFrame(
    features,
    columns=["Actor", "Avg. Profit", "Avg. Vote", "Avg. Popularity",
             "Connections with Other Important Actors)"],
)
top15
Actor | Avg. Profit | Avg. Vote | Avg. Popularity | Connections with Other Important Actors) | |
---|---|---|---|---|---|
0 | Scarlett Johansson | 2.916515e+07 | 6.437500 | 33.564579 | 13 |
1 | Russell Crowe | 7.690823e+07 | 6.600000 | 33.467027 | 13 |
2 | Cameron Diaz | 1.206145e+08 | 5.994118 | 38.642755 | 14 |
3 | Tom Cruise | 2.369365e+08 | 6.646154 | 48.156016 | 14 |
4 | Brad Pitt | 1.353495e+08 | 6.813793 | 48.674368 | 14 |
5 | Matt Damon | 9.884363e+07 | 6.636364 | 36.726481 | 14 |
6 | Meryl Streep | 8.384653e+07 | 6.482609 | 25.091077 | 14 |
7 | Christian Bale | 1.388917e+08 | 6.935000 | 55.938757 | 15 |
8 | Jude Law | 5.998734e+07 | 6.371429 | 29.928448 | 15 |
9 | Kate Winslet | 1.448356e+08 | 7.007143 | 30.104512 | 15 |
10 | Cate Blanchett | 1.037595e+08 | 6.562500 | 31.213836 | 16 |
11 | Julianne Moore | 3.168168e+07 | 6.461111 | 25.764328 | 16 |
12 | Julia Roberts | 9.624590e+07 | 6.200000 | 22.245915 | 18 |
13 | Morgan Freeman | 7.366736e+07 | 6.447826 | 32.025440 | 21 |
14 | Robert De Niro | 4.423925e+07 | 6.478049 | 24.653944 | 24 |
# Axis label for each actor: "<name> (<number of connections>)".
top15['Actor1'] = top15['Actor'] + " (" + top15['Connections with Other Important Actors)'].astype('str') + ")"

# Three horizontal bar traces over the same actors, one per measure.
trace1 = go.Bar(
    y=top15["Actor1"],
    orientation='h',
    x=top15["Avg. Profit"] / 1000000,  # expressed in millions
    name='Profit',
    marker=dict(color='rgb(161,215,106)'),
)
trace2 = go.Bar(
    y=top15["Actor1"],
    orientation='h',
    x=top15["Avg. Popularity"],
    name='Popularity',
    marker=dict(color='rgb(37,52,148)'),
)
trace3 = go.Bar(
    y=top15["Actor1"],
    orientation='h',
    x=top15["Avg. Vote"] * 10,  # average vote scaled by ten
    name='Vote',
    marker=dict(color='rgb(65,182,196)'),
)

# Kept under its own name so the movie DataFrame `data` is not overwritten.
bar_traces = [trace1, trace2, trace3]

# Dropdown: each button shows exactly one trace and updates the title.
# active=-1 means no button is pre-selected.
updatemenus = [
    dict(
        active=-1,
        x=-0.3,
        buttons=[
            dict(
                label='Average Profit (in Millions)',
                method='update',
                args=[{'visible': [True, False, False]},
                      {'title': 'Average Profit'}]),
            dict(
                label='Average Popularity',
                method='update',
                args=[{'visible': [False, True, False]},
                      {'title': 'Average Popularity'}]),
            dict(
                label='Average Vote',
                method='update',
                args=[{'visible': [False, False, True]},
                      {'title': 'Average Vote'}]),
        ],
    )
]

layout = dict(title='Average Measures for Important Actors (Select from Dropdown)', showlegend=False,
              updatemenus=updatemenus)
fig = dict(data=bar_traces, layout=layout)
iplot(fig)
# Build the happiness table: the 2017 data enriched with GDP / country code,
# the 2015 happiness score, and latitude / longitude.
df = pd.read_csv('2017.csv')

# GDP and country code, joined by country name.
codes = pd.read_csv("2014_world_gdp_with_codes.csv")
codes.columns = ["Country", "Gdp", "Code"]
df = df.merge(codes, how="left", on=["Country"])

# Join the 2015 score by country name. Assigning the 2015 column directly
# would align rows by position, pairing each country with whichever country
# occupies the same row in the 2015 file.
# NOTE(review): assumes 2015.csv has a "Country" column - confirm.
scores_2015 = pd.read_csv("2015.csv")[["Country", "Happiness Score"]].rename(
    columns={"Happiness Score": "happiness score 2015"}
)
df = df.merge(scores_2015, how="left", on=["Country"])

# Coordinates, joined by country name.
lat = pd.read_csv("lat.csv")
lat.columns = ["code", "latitude", "longitude", "Country"]
df = df.merge(lat[["latitude", "longitude", "Country"]], on=["Country"], how="left")
# World choropleth of the happiness score, one region per country code.
# Kept under its own name so the earlier `data` variable is not overwritten.
choropleth_traces = [dict(
    type='choropleth',
    locations=df['Code'],
    z=df['Happiness.Score'],
    text=df['Country'],
    # Colorscale stops must be normalized to [0, 1]. The six stops correspond
    # to scores 2.7, 3.6, 4.5, 5.4, 6.3 and 7.2, so zmin/zmax pin the colour
    # range to those same end points.
    zmin=2.7,
    zmax=7.2,
    colorscale=[[0.0, "rgb(5, 10, 172)"], [0.2, "rgb(40, 60, 190)"], [0.4, "rgb(70, 100, 245)"],
                [0.6, "rgb(90, 120, 245)"], [0.8, "rgb(106, 137, 247)"], [1.0, "rgb(220, 220, 220)"]],
    autocolorscale=False,
    reversescale=True,
    marker=dict(
        line=dict(
            color='rgb(180,180,180)',
            width=0.5)),
    colorbar=dict(
        # NOTE(review): `autotick` is a legacy attribute that only gets through
        # because validation is disabled below - confirm it is still needed.
        autotick=False,
        title='Happiness Score'),
)]

layout = dict(
    title='Happiness Score',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(type='mercator')  # projection names are lower-case
    )
)

fig = dict(data=choropleth_traces, layout=layout)
iplot(fig, validate=False)
# Hover text: country name plus its GDP figure.
df['text'] = df['Country'] + '<br>GDP ' + (df['Gdp']).astype(str) + ' billion'

# Inclusive, 1-based rank bands. Each band is turned into a half-open row
# slice below so that no row falls between two bands.
# NOTE(review): assumes the rows of `df` are ordered by happiness rank - confirm.
limits = [(1, 30), (31, 60), (61, 90), (91, 120), (121, 160)]
colors = ["blue", "green", "yellow", "rgb(255,65,54)", "rgb(133,20,75)"]
countries = []
scale = 10  # divisor applied to GDP before it is used as marker size

# One scattergeo trace per rank band so each band can be toggled in the legend.
for (first_rank, last_rank), band_color in zip(limits, colors):
    df_sub = df[first_rank - 1:last_rank]
    city = dict(
        type='scattergeo',
        lon=df_sub['longitude'],
        lat=df_sub['latitude'],
        text=df_sub['text'],
        marker=dict(
            size=df_sub['Gdp'] / scale,
            color=band_color,
            line=dict(width=0.5, color='rgb(40,40,40)'),
            sizemode='area',
        ),
        name='{0} - {1}'.format(first_rank, last_rank))
    countries.append(city)

layout = dict(
    title='2017 World Happiness Rank (Size Proportional to GDP) <br>(Click legend to toggle traces)',
    showlegend=True,
    geo=dict(
        scope=None,
        projection=dict(type='mercator'),  # projection names are lower-case
        showland=True,
        landcolor='rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)"
    ),
)

fig = dict(data=countries, layout=layout)
iplot(fig, validate=False)