In [26]:

import pandas as pd, numpy as np, plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
py.offline.init_notebook_mode(connected=True)
from plotly import tools
from plotly.graph_objs import *
%matplotlib inline
from math import floor
import networkx as nx

This notebook focuses on the following types of plots using plotly:

Bubble Charts
Bar Charts
Network & Drop-down Menus (This is the hardest part & plotly's documentation is not enough)
Geo-Visualizations (Choropleths & Symbol Plots)

In [13]:

# Load the cleaned movie table and derive profit as revenue minus budget.
data = pd.read_csv("data/cleaned_movie.csv").assign(
    profit=lambda movies: movies["revenue"] - movies["budget"]
)

In [14]:

def extract_decade(x):
    """Return the decade label for a year number, e.g. 2009 -> "2000s"."""
    decade_start = 10 * floor(x / 10)
    return "{}s".format(decade_start)
# Tag every movie with the decade label of its release year.
data["decade"] = data["year"].map(extract_decade)

In [15]:

# Show the first three movies with columns as rows (the frame is wide).
data.head(3).T

Out[15]:

	0	1	2
budget	237000000	300000000	245000000
genres	['Action', 'Adventure', 'Fantasy', 'Science Fi...	['Adventure', 'Fantasy', 'Action']	['Action', 'Adventure', 'Crime']
keywords	['culture clash', 'future', 'space war', 'spac...	['ocean', 'drug abuse', 'exotic island', 'east...	['spy', 'based on novel', 'secret agent', 'seq...
original_language	en	en	en
overview	In the 22nd century, a paraplegic Marine is di...	Captain Barbossa, long believed to be dead, ha...	A cryptic message from Bond’s past sends him o...
popularity	150.438	139.083	107.377
production_companies	['Ingenious Film Partners', 'Twentieth Century...	['Walt Disney Pictures', 'Jerry Bruckheimer Fi...	['Columbia Pictures', 'Danjaq', 'B24']
production_countries	['United States of America', 'United Kingdom']	['United States of America']	['United Kingdom', 'United States of America']
release_date	2009-12-10	2007-05-19	2015-10-26
revenue	2787965087	961000000	880674609
runtime	162	169	148
spoken_languages	['English', 'Español']	['English']	['Français', 'English', 'Español', 'Italiano',...
vote_average	7.2	6.9	6.3
vote_count	11800	4500	4466
title	Avatar	Pirates of the Caribbean: At World's End	Spectre
cast	[{'cast_id': 242, 'character': 'Jake Sully', '...	[{'cast_id': 4, 'character': 'Captain Jack Spa...	[{'cast_id': 1, 'character': 'James Bond', 'cr...
crew	[{'credit_id': '52fe48009251416c750aca23', 'de...	[{'credit_id': '52fe4232c3a36847f800b579', 'de...	[{'credit_id': '54805967c3a36829b5002c41', 'de...
actor1	Sam Worthington	Johnny Depp	Daniel Craig
actor2	Zoe Saldana	Orlando Bloom	Christoph Waltz
actor3	Sigourney Weaver	Keira Knightley	Léa Seydoux
director	James Cameron	Gore Verbinski	Sam Mendes
year	2009	2007	2015
month	12	5	10
day	10	19	26
dow	3	5	0
profit	2550965087	661000000	635674609
decade	2000s	2000s	2010s

Bar Plots - Vote Count for Each Decade¶

In [20]:

# Total vote count per decade; the first four decade rows are left out of the chart.
votes_per_decade = data.groupby('decade')['vote_count'].sum().reset_index()
df_by_vote = votes_per_decade.iloc[4:]

In [21]:

# Bar chart: one bar per decade, height = total number of votes cast.
bar_data = [go.Bar(x=df_by_vote['decade'], y=df_by_vote["vote_count"])]

bar_layout = dict(
    title='Vote Count for each decade',
    xaxis=dict(title='Decade'),
    yaxis=dict(title='Total Votes'),
)

py.offline.iplot(dict(data=bar_data, layout=bar_layout))

Violin Plots - Movie Ratings by Decade¶

In [23]:

# One violin trace per decade, in order of first appearance in the data.
# The list of decades is computed once instead of three times per loop iteration.
decades = pd.unique(data['decade'])

da = []
for decade in decades:
    # Boolean row mask for this decade, reused for both the x and y series.
    in_decade = data['decade'] == decade
    da.append({
        "type": 'violin',
        "x": data['decade'][in_decade],
        "y": data['vote_average'][in_decade],
        "name": decade,
        "box": {"visible": True},
        "meanline": {"visible": True},
    })

fig = {
    "data": da,
    "layout": {
        "title": "Average Movie Ratings by Decade",
        "xaxis": dict(title='Decade', autotick=False, showticklabels=True),
        "yaxis": dict(title='Average Rating'),
    },
}

# validate=False passes the raw dicts through without plotly.py's schema check.
# NOTE(review): `autotick` looks like a legacy axis attribute that validation may
# reject -- confirm before dropping validate=False.
iplot(fig, validate=False)

Bubble Chart - Profit vs Budget (Colored by Rating)¶

In [25]:

# Bubble chart: one marker per movie, x = budget, y = profit; marker size and
# colour are both driven by the movie's average rating.
layout = go.Layout(
    title='Profit vs. Budget',
    xaxis=dict(
        title='Budget',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 4e8],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)

# Keep only movies with a recorded budget above 5000. The filter is evaluated
# once and reused, instead of being recomputed for every trace attribute.
budgeted = data[data.budget > 5000]

data2 = [go.Scatter(
    x=budgeted.budget.values,   # Budget
    y=budgeted.profit.values,   # Profit (revenue - budget), not gross revenue
    mode='markers',
    text=budgeted.title.values,  # Movie titles shown on hover
    marker=dict(
        size=3 * budgeted.vote_average.values,
        sizeref=1.0,
        color=budgeted.vote_average.values,
        # Plain dict instead of the deprecated `ColorBar` class, which was only
        # in scope through `from plotly.graph_objs import *`.
        colorbar=dict(title='Average Rating<br> &nbsp;', tickvals=[0, 1.5, 3, 5, 7, 8.5]),
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=data2, layout=layout)
iplot(fig)

Network Visualization¶

Subsetting the data to get important actors¶

In [27]:

# Count how many movies each actor headlines (appears as `actor1`).
df_appearance = (
    data[['actor1', 'year']]
    .groupby('actor1')
    .count()
    .reset_index()
)

# "Prolific" = first-billed in more than three movies.
is_prolific = df_appearance['year'] > 3
most_prolific = df_appearance.loc[is_prolific, 'actor1'].tolist()

# Keep only movies whose three listed actors are all prolific.
all_three_prolific = (
    data.actor1.isin(most_prolific)
    & data.actor2.isin(most_prolific)
    & data.actor3.isin(most_prolific)
)
subset1 = data[all_three_prolific].reset_index(drop=True)

Creating Network from Subset Data¶

In [38]:

# Every movie contributes the three co-starring pairs among its listed actors.
pair = []
for first, second, third in zip(subset1["actor1"], subset1["actor2"], subset1["actor3"]):
    pair.extend([(first, second), (first, third), (second, third)])

In [39]:

# Undirected co-starring graph: one node per actor, one edge per pair of actors
# listed together in a movie (repeated pairs collapse into a single edge).
G = nx.Graph()
G.add_edges_from(pair)

# The `G.node` alias no longer exists in current networkx releases;
# `G.nodes()` is the supported accessor.
nodes = list(G.nodes())

# Degree of each actor = number of distinct co-stars inside this graph.
d = dict(nx.degree(G))

# The 15 best-connected actors, in ascending order of degree. Slicing from the
# end avoids a hard-coded start offset that only works for one exact node count.
imp_actors = sorted(d, key=d.get)[-15:]

In [40]:

# 2-D coordinates for every actor node, from networkx's Kamada-Kawai layout.
pos = nx.kamada_kawai_layout(G)

In [41]:

# Node coordinates, in the same order as `nodes` (used for hover text and colour).
Xv = [pos[k][0] for k in nodes]
Yv = [pos[k][1] for k in nodes]

# Edge segments: each edge contributes (start, end, None); the None entry
# separates one segment from the next inside a single 'lines' trace.
# Iterating over the graph's edges draws each co-star pair once, whereas the raw
# `pair` list repeats a pair for every movie the two actors share.
Xed = []
Yed = []
for source, target in G.edges():
    Xed += [pos[source][0], pos[target][0], None]
    Yed += [pos[source][1], pos[target][1], None]

# Plain dicts replace the deprecated Line/Marker/ColorBar/Font/XAxis/YAxis/
# Margin/Data wrappers that were only in scope through the wildcard import.
trace3 = go.Scatter(
    x=Xed,
    y=Yed,
    mode='lines',
    line=dict(color='rgb(210,210,210)', width=1),
    hoverinfo='none',
)
trace4 = go.Scatter(
    x=Xv,
    y=Yv,
    mode='markers',
    name='net',
    marker=dict(
        symbol='circle',  # 'dot' is not a valid plotly marker symbol
        size=10,
        showscale=True,
        colorscale='Viridis',
        reversescale=False,
        colorbar=dict(
            title='Number of Connections<br> &nbsp;',
            tickvals=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24],
        ),
        # Colour each node by its degree, looked up explicitly in node order.
        color=[d[k] for k in nodes],
        line=dict(color='rgb(50,50,50)', width=0.5),
    ),
    text=nodes,
    hoverinfo='text',
)

layout = go.Layout(
    title="Leading Actors and their Connections",
    font=dict(size=12),
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    margin=dict(l=40, r=40, b=85, t=100),
)

data1 = [trace3, trace4]
fig1 = go.Figure(data=data1, layout=layout)
iplot(fig1)

In [42]:

# The 15 best-connected actors, in ascending order of degree. Slicing from the
# end avoids a hard-coded start offset that only works for one exact node count.
imp_actors = sorted(d, key=d.get)[-15:]

In [44]:

# One summary tuple per important actor:
# (name, mean profit, mean rating, mean popularity, degree in the co-star graph).
# NOTE(review): only movies where the actor is `actor1` or `actor2` are averaged;
# `actor3` appearances are not included -- confirm this is intended.
features = []
for item in imp_actors:
    billed_first_or_second = (data['actor1'] == item) | (data['actor2'] == item)
    group = data.loc[billed_first_or_second]
    summary = (
        item,
        group.profit.mean(),
        group.vote_average.mean(),
        group.popularity.mean(),
        d[item],
    )
    features.append(summary)

In [45]:

# Tabulate the per-actor summaries; column labels are assigned at construction.
top15 = pd.DataFrame(
    features,
    columns=[
        "Actor",
        "Avg. Profit",
        "Avg. Vote",
        "Avg. Popularity",
        "Connections with Other Important Actors)",
    ],
)
top15

Out[45]:

	Actor	Avg. Profit	Avg. Vote	Avg. Popularity	Connections with Other Important Actors)
0	Scarlett Johansson	2.916515e+07	6.437500	33.564579	13
1	Russell Crowe	7.690823e+07	6.600000	33.467027	13
2	Cameron Diaz	1.206145e+08	5.994118	38.642755	14
3	Tom Cruise	2.369365e+08	6.646154	48.156016	14
4	Brad Pitt	1.353495e+08	6.813793	48.674368	14
5	Matt Damon	9.884363e+07	6.636364	36.726481	14
6	Meryl Streep	8.384653e+07	6.482609	25.091077	14
7	Christian Bale	1.388917e+08	6.935000	55.938757	15
8	Jude Law	5.998734e+07	6.371429	29.928448	15
9	Kate Winslet	1.448356e+08	7.007143	30.104512	15
10	Cate Blanchett	1.037595e+08	6.562500	31.213836	16
11	Julianne Moore	3.168168e+07	6.461111	25.764328	16
12	Julia Roberts	9.624590e+07	6.200000	22.245915	18
13	Morgan Freeman	7.366736e+07	6.447826	32.025440	21
14	Robert De Niro	4.423925e+07	6.478049	24.653944	24

In [47]:

# Axis label of the form "Name (connections)", e.g. "Tom Cruise (14)".
connection_counts = top15['Connections with Other Important Actors)'].astype(str)
top15['Actor1'] = top15['Actor'].str.cat(connection_counts, sep=" (") + ")"

In [48]:

# Horizontal bar chart of three per-actor measures with a dropdown that shows
# one measure at a time.
#
# The traces are kept in `bar_traces` rather than `data`, so the movie DataFrame
# named `data` is not overwritten and earlier cells can still be re-run.
# Each spec: (trace name, x values, bar colour, dropdown label, chart title).
metric_specs = [
    # Profit is shown in millions.
    ('Profit', top15["Avg. Profit"] / 1000000, 'rgb(161,215,106)',
     'Average Profit (in Millions)', 'Average Profit'),
    ('Popularity', top15["Avg. Popularity"], 'rgb(37,52,148)',
     'Average Popularity', 'Average Popularity'),
    # NOTE(review): the vote is multiplied by 10, presumably so its bars are
    # comparable in length to the other measures before a selection is made --
    # confirm; the dropdown label does not mention the scaling.
    ('Vote', top15["Avg. Vote"] * 10, 'rgb(65,182,196)',
     'Average Vote', 'Average Vote'),
]

bar_traces = [
    go.Bar(
        y=top15["Actor1"],
        orientation='h',
        x=values,
        name=trace_name,
        marker=dict(color=bar_color),
    )
    for trace_name, values, bar_color, _label, _title in metric_specs
]

# One dropdown button per measure: makes only that trace visible and retitles
# the chart.
buttons = [
    dict(
        label=label,
        method='update',
        args=[
            {'visible': [other == selected for other in range(len(metric_specs))]},
            {'title': title},
        ],
    )
    for selected, (_name, _values, _color, label, title) in enumerate(metric_specs)
]

# active=-1: no button is pre-selected, so all three traces are shown initially.
updatemenus = [dict(active=-1, x=-0.3, buttons=buttons)]

layout = dict(title='Average Measures for Important Actors (Select from Dropdown)', showlegend=False,
              updatemenus=updatemenus)

fig = dict(data=bar_traces, layout=layout)

iplot(fig)

In [ ]:

Geo-Visualizations¶

Processing Data for GeoVisualizations¶

In [2]:

# 2017 happiness table, enriched with GDP / country code, the 2015 happiness
# score, and latitude / longitude. Every join is a left merge on "Country", so
# the 2017 row order is preserved.
df = pd.read_csv('2017.csv')
codes = pd.read_csv("2014_world_gdp_with_codes.csv")

codes.columns = ["Country", "Gdp", "Code"]
df = df.merge(codes, how="left", on=["Country"])

# Match 2015 scores by country. Assigning the 2015 column directly would align
# on the row index and attach each score to whichever country sits at the same
# row position in the 2017 table.
# Assumes 2015.csv has a "Country" column -- TODO confirm.
scores_2015 = (
    pd.read_csv("2015.csv")[["Country", "Happiness Score"]]
    .rename(columns={"Happiness Score": 'happiness score 2015'})
)
df = df.merge(scores_2015, how="left", on=["Country"])

lat = pd.read_csv("lat.csv")
lat.columns = ["code", "latitude", "longitude", "Country"]
df = df.merge(lat[["latitude", "longitude", "Country"]], on=["Country"], how="left")

Choropleths of Happiness Score Country-wise¶

In [4]:

# World choropleth coloured by the 2017 happiness score.
# The trace list is named `choropleth_traces` so the movie DataFrame `data`
# defined earlier in the notebook is not overwritten.
choropleth_traces = [dict(
    type='choropleth',
    locations=df['Code'],
    z=df['Happiness.Score'],
    text=df['Country'],
    # Colorscale stops must be normalised positions from 0 to 1, not raw z
    # values; the original stops 2.7 ... 7.2 were evenly spaced, which maps to
    # 0, 0.2, ..., 1.
    colorscale=[
        [0.0, "rgb(5, 10, 172)"],
        [0.2, "rgb(40, 60, 190)"],
        [0.4, "rgb(70, 100, 245)"],
        [0.6, "rgb(90, 120, 245)"],
        [0.8, "rgb(106, 137, 247)"],
        [1.0, "rgb(220, 220, 220)"],
    ],
    autocolorscale=False,
    reversescale=True,
    marker=dict(
        line=dict(
            color='rgb(180,180,180)',
            width=0.5)),
    colorbar=dict(
        autotick=False,
        title='Happiness Score'),
)]

layout = dict(
    title='Happiness Score',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        # Projection type names are lower-case in plotly.
        projection=dict(type='mercator')
    )
)

fig = dict(data=choropleth_traces, layout=layout)
# validate=False passes the raw dicts through without plotly.py's schema check.
# NOTE(review): `autotick` looks like a legacy colorbar attribute that
# validation may reject -- confirm before dropping validate=False.
iplot(fig, validate=False)

ScatterGeo Plots/Symbol Maps of Relationship between GDP & Happiness Rank¶

In [5]:

# Symbol map: one marker per country at its latitude/longitude, sized by GDP and
# coloured by the band of row positions it falls in.
# NOTE(review): the bands are meant as happiness-rank bands, which presumes `df`
# is still in 2017 rank order -- confirm against 2017.csv.
df['text'] = df['Country'] + '<br>GDP ' + (df['Gdp']).astype(str)+' billion'
limits = [(0,30),(31,60),(61,90),(91,120),(121,160)]
colors = ["blue","green","yellow","rgb(255,65,54)","rgb(133,20,75)"]
countries = []
scale = 10

for band_color, lim in zip(colors, limits):
    # The band bounds are 1-based and inclusive, while row slices are 0-based
    # and end-exclusive. Slicing df[31:60] directly would silently drop rows 30,
    # 60, 90 and 120, so the start is shifted down by one (clamped at 0).
    df_sub = df.iloc[max(lim[0] - 1, 0):lim[1]]
    band_trace = dict(
        type='scattergeo',
        # No `locationmode`: markers are placed by lon/lat, and 'World-Map' is
        # not a valid value for it.
        lon=df_sub['longitude'],
        lat=df_sub['latitude'],
        text=df_sub['text'],
        marker=dict(
            size=df_sub['Gdp'] / scale,
            color=band_color,
            line=dict(width=0.5, color='rgb(40,40,40)'),
            sizemode='area',
            # `title` is not a marker attribute and has been removed.
        ),
        name='{0} - {1}'.format(lim[0], lim[1]))
    countries.append(band_trace)

layout = dict(
        title = '2017 World Happiness Rank (Size Proportional to GDP) <br>(Click legend to toggle traces)',
        showlegend = True,
        geo = dict(
            scope=None,
            # Projection type names are lower-case in plotly.
            projection=dict( type='mercator' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=countries, layout=layout )
iplot(fig, validate = False)