In [1]:
import gzip
import re
from collections import Counter
from itertools import combinations

import networkx as nx
In [2]:
# Empty undirected graph; a later cell fills it with one edge per pair of
# actors that appear in the same movie.
G=nx.Graph()
In [3]:
# Build the movie -> [actors] mapping from the gzipped "movie,actor" CSV.
movies={}
with gzip.open('imdb_data/imdb_s.csv.gz') as f:
    for line in f:
        if ',' in line:
            # A record must unpack into exactly two fields: movie, then actor.
            movie,actor = line.rstrip().split(',')
            movies.setdefault(movie,[]).append(actor)
        else:
            # No separator on this line: echo it so it can be inspected, then move on.
            print(line)
In [4]:
len(movies)
Out[4]:
58642
In [5]:
movies.keys()[:10]
Out[5]:
['Him_gok_2001',
 'Jack_Napier_Show_2__The_2002',
 'Ms__Goldman_2004',
 'Me_Luv_U_Long_Time_2002',
 'Waltz_2004',
 'All_the_Stage_Is_a_World_2005',
 'Before_the_Bell_Rang_2003',
 'Ticking_Man__The_2001',
 '_Groupe_flag__2002',
 'Making_of__Double_Vision___The_2002']
In [6]:
movies['Him_gok_2001']
Out[6]:
['Chang__Ken__II_',
 'Chan__Moses',
 'Fong__Alex__I_',
 'Ha__Shiu_Sing',
 'Lam__Lap_Sam',
 'Lee__Wai_sheung',
 'Lok__Dat_Wai',
 'Summer__Danny',
 'Wan__Tin_chiu',
 'Wong__Ken__I_']
In [89]:
# Link every pair of actors credited in the same movie.
for actors in movies.values():
    # combinations() yields each pair of cast positions (i < j) exactly once, in
    # the same order as nested index loops would, and feeds them straight to the
    # graph instead of building a throwaway list of None results per movie.
    G.add_edges_from(combinations(actors,2))
In [93]:
nx.number_connected_components(G)
Out[93]:
3077
In [94]:
# Connected components of G as a list of node lists, largest component first.
# The ordering and list form are made explicit because later cells index
# comps[0] and slice comps[0][:10]; newer networkx releases return an unordered
# generator from connected_components(), which would break that indexing.
comps=sorted((list(cc) for cc in nx.connected_components(G)),key=len,reverse=True)
In [116]:
len(comps[0])
Out[116]:
182323
In [110]:
# Histogram of component sizes: size -> number of components with that size.
lhist=Counter(map(len,comps))
In [115]:
# The ten largest distinct component sizes (keys of the size histogram), ascending.
sorted(lhist.keys())[-10:]
Out[115]:
[30, 32, 33, 39, 42, 50, 52, 54, 99, 182323]
In [118]:
nx.is_connected(G)
Out[118]:
False
In [107]:
# Number of movie credits per actor, accumulated one cast list at a time.
actorlist = Counter()
for cast in movies.values():
    actorlist.update(cast)
In [108]:
actorlist.most_common(10)
Out[108]:
[('Stone__Lee__II_', 490),
 ('Steele__Lexington', 354),
 ('Lawrence__Joel__II_', 346),
 ('Davis__Mark__V_', 335),
 ('Holmes__Steve', 318),
 ('Pete__Mr_', 308),
 ('Stone__Evan__I_', 294),
 ('Marcus__Mr_', 287),
 ('Cannon__Chris__III_', 285),
 ('Wood__Mark__IV_', 278)]
In [109]:
len(actorlist)
Out[109]:
198571
In [119]:
comps[0][:10]
Out[119]:
['Shelley__Harry_Anthony',
 'Hyson__Matt',
 'Sz_les__L_szl_',
 'Kermizian__Brian',
 'Prak__David',
 'Green__David_Gordon',
 'Jolicoeur__Clermont',
 'Rist__Robbie',
 'Stein__Seymour',
 'Jurin__Jem']
In [120]:
# One standalone subgraph per connected component of G, largest first, so that
# ccG[0] is the giant component. Built from connected_components() because the
# connected_component_subgraphs() helper no longer exists in current networkx;
# .copy() keeps each subgraph independent of G, as the old helper did.
ccG=[G.subgraph(cc).copy() for cc in sorted(nx.connected_components(G),key=len,reverse=True)]
In [121]:
Gc=ccG[0]
In [122]:
len(Gc.nodes())
Out[122]:
182323
In [123]:
Gc.edges()[:10]
Out[123]:
[('Shelley__Harry_Anthony', 'Winsett__Jerry'),
 ('Shelley__Harry_Anthony', 'Waugh__Scott'),
 ('Shelley__Harry_Anthony', 'Gilbert__Lance'),
 ('Shelley__Harry_Anthony', 'Lee__Will_Yun'),
 ('Shelley__Harry_Anthony', 'Harrington__Adam__II_'),
 ('Shelley__Harry_Anthony', 'Lavin__Jacob'),
 ('Shelley__Harry_Anthony', 'James__Jesse__VII_'),
 ('Shelley__Harry_Anthony', 'Henderson__Martin__I_'),
 ('Shelley__Harry_Anthony', 'Kahn__Joseph'),
 ('Shelley__Harry_Anthony', 'Ashker__John')]
In [7]:
#nx.average_shortest_path_length(Gc)
In [127]:
dc=nx.degree_centrality(ccG[0])
In [130]:
dc.values()[:10]
Out[130]:
[0.00014808964359759106,
 0.0015796228650409715,
 0.0005210561533989316,
 0.00017002885005649345,
 0.00019196805651539584,
 4.387841291780476e-05,
 0.0003729665098013405,
 0.0003236032952688101,
 0.0001316352387534143,
 2.193920645890238e-05]
In [133]:
# Tally movies by the last four characters of the title key (the release year
# in keys such as 'Him_gok_2001').
years=Counter(title[-4:] for title in movies)
In [134]:
years.most_common()
Out[134]:
[('2003', 11989),
 ('2004', 11963),
 ('2002', 10886),
 ('2001', 10219),
 ('2000', 9584),
 ('2005', 3700),
 ('2006', 285),
 ('2007', 15),
 ('2008', 1)]
In [151]:
# Credits per actor, counting only movies whose title key ends in '2005'.
a2005=Counter()
for movie,actors in movies.items():
    if movie.endswith('2005'):
        a2005.update(actors)
In [152]:
a2005.most_common()[:10]
Out[152]:
[('Andersen__Kim_S_nderholm', 14),
 ('Trejo__Danny', 13),
 ('Freeman__Morgan__I_', 11),
 ('Talkington__Jonas', 11),
 ('Madsen__Michael__I_', 11),
 ('Busey__Gary', 10),
 ('Williams__Robin__I_', 10),
 ('Bennett__Jeff__I_', 10),
 ('Astin__Sean', 10),
 ('Depp__Johnny', 10)]
In [153]:
len(a2005)
Out[153]:
21785
In [193]:
# Co-appearance graph restricted to movies whose title key ends in '2005' and
# starts with 'A' or 'T' (re.match anchors the character class at position 0).
Gs=nx.Graph()
for movie,actors in movies.items():
    if movie.endswith('2005') and re.match(r'[AT]',movie):
        # Add one edge per pair of actors in this movie. combinations() streams
        # the pairs to the graph instead of building a discarded list of None.
        Gs.add_edges_from(combinations(actors,2))
In [194]:
len(Gs.nodes()),len(Gs.edges())
Out[194]:
(3117, 33373)
In [200]:
# One standalone subgraph per connected component of Gs, largest first, so that
# cGs[0] is the biggest component and cGs[:10] are the ten biggest. Built from
# connected_components() because connected_component_subgraphs() no longer
# exists in current networkx; .copy() detaches each subgraph from Gs.
cGs=[Gs.subgraph(cc).copy() for cc in sorted(nx.connected_components(Gs),key=len,reverse=True)]
In [196]:
[len(g) for g in cGs[:10]]
Out[196]:
[655, 295, 87, 79, 58, 48, 44, 44, 38, 36]
In [201]:
myG=cGs[0]
In [210]:
len(myG.nodes())
Out[210]:
655
In [203]:
mdc=nx.degree_centrality(myG)
In [206]:
mdc.values()[:10]
Out[206]:
[0.03669724770642202,
 0.019877675840978593,
 0.010703363914373088,
 0.012232415902140673,
 0.09480122324159021,
 0.01834862385321101,
 0.013761467889908258,
 0.053516819571865444,
 0.11009174311926606,
 0.04434250764525994]
In [211]:
myG['Depp__Johnny']
Out[211]:
{'Affleck__Ben': {},
 'Bush__George': {},
 'Caan__James': {},
 'Cage__Nicolas': {},
 'Caviezel__James': {},
 'Cheadle__Don': {},
 'Chesney__Kenny': {},
 'Clapton__Eric': {},
 'Clinton__Bill': {},
 'Clooney__George': {},
 'Damon__Matt': {},
 'DeVito__Danny': {},
 'De_Niro__Robert': {},
 'DiCaprio__Leonardo': {},
 'Douglas__Michael__I_': {},
 'Downey_Jr___Robert': {},
 'Eastwood__Clint': {},
 'Foxx__Jamie': {},
 'Freeman__Morgan__I_': {},
 'Garcia__Andy__I_': {},
 'Grammer__Kelsey': {},
 'Grant__Hugh__I_': {},
 'Groban__Josh': {},
 'John__Elton': {},
 'Jones__Quincy': {},
 'Kravitz__Lenny': {},
 'Lauer__Matt': {},
 'Leno__Jay': {},
 'Lowe__Rob__I_': {},
 'Maroon_5': {},
 'Matthews__Christopher__II_': {},
 'Mayer__John__VII_': {},
 'McCormack__Eric': {},
 'McDonough__Neal': {},
 'Nelly__III_': {},
 'O_Reilly__Bill__I_': {},
 'Pitt__Brad': {},
 'Robbins__Tim__I_': {},
 'Romano__Ray__I_': {},
 'Seacrest__Ryan': {},
 'Selleck__Tom': {},
 'Sinise__Gary': {},
 'Spacey__Kevin': {},
 'Spade__David': {},
 'Tarantino__Quentin': {},
 'Waters__Roger': {},
 'Williams__Brian__III_': {},
 'Willis__Bruce__I_': {},
 'Wilson__Brian__I_': {},
 'Wonder__Stevie': {},
 'Woods__James__I_': {},
 'Wright__Bob__VI_': {},
 'Wyle__Noah': {}}
In [230]:
len(nx.degree_histogram(myG))
Out[230]:
89
In [221]:
print nx.info(myG)
Name: 
Type: Graph
Number of nodes: 655
Number of edges: 10795
Average degree:  32.9618
In [222]:
nx.density(myG)
Out[222]:
0.05040035483343838
In [224]:
# Map every node of myG to the number of entries in its adjacency dict.
degrees={}
for node in myG:
    degrees[node]=len(myG[node])
In [225]:
# Count how many adjacency lists each actor appears in across myG.
# A generator of neighbour names is passed (not the adjacency dict itself)
# so Counter tallies the keys rather than treating the dict as name -> count.
degs=Counter(nbr for node in myG for nbr in myG[node])
In [227]:
degs.most_common()[:10]
Out[227]:
[('Willis__Bruce__I_', 88),
 ('Lowe__Rob__I_', 85),
 ('Armstrong__Curtis', 79),
 ('Lopez__Nick__III_', 78),
 ('Gainey__M_C_', 78),
 ('Gore__Chris', 73),
 ('Dello_Stritto__Frank_J_', 72),
 ('Schodowski__Chuck', 72),
 ('Monks__Joseph', 72),
 ('Tippett__Phil', 72)]
In [228]:
len(myG['Willis__Bruce__I_'])
Out[228]:
88
In [231]:
[a for a in myG if a.startswith('Willis')]
Out[231]:
['Willis__Bruce__I_']
In [235]:
bc=nx.betweenness_centrality(myG)
In [236]:
len(bc)
Out[236]:
655
In [241]:
sorted(bc.items(),key=lambda x:x[1])[-10:]
Out[241]:
[('Marquette__Chris', 0.1894104368920672),
 ('Lowe__Rob__I_', 0.19293178351933277),
 ('Leguizamo__John', 0.19461694086572898),
 ('Armstrong__Curtis', 0.19862689726550245),
 ('Fishburne__Laurence', 0.21505074204682226),
 ('Rule__Ja', 0.23025423318706215),
 ('Kay_ru__Artel', 0.23325887107726748),
 ('Willis__Bruce__I_', 0.23352854308429788),
 ('Stanton__Harry_Dean', 0.23797052106407604),
 ('Gainey__M_C_', 0.3448796661843011)]
In [243]:
nx.draw_networkx(myG)
In [9]:
# Actors of interest; a movie is kept when any of them is in its cast.
alist= (
    'Lee__Christopher__I_',
    'Depp__Johnny',
    'Willis__Bruce__I_',
    'Schwarzenegger__Arnold',
)
def keep(movie):
    """Return True if at least one actor from `alist` is listed in movies[movie]."""
    return any(actor in movies[movie] for actor in alist)
# Further candidate names, not currently included in alist:
# 'Walken__Christopher', 'Stiller__Ben', 'Hoffman__Dustin', 'Smith__Will__I_', 'Cruise__Tom', 'Travolta__John', 'Ferrell__Will'
In [14]:
len([movie for movie in movies if keep(movie)])
Out[14]:
179
In [10]:
# Rebuild Gs as the co-appearance graph of only those movies accepted by keep().
Gs=nx.Graph()
for movie,actors in movies.items():
    if keep(movie):
        # Add one edge per pair of actors in this movie. combinations() streams
        # the pairs to the graph instead of building a discarded list of None.
        Gs.add_edges_from(combinations(actors,2))
In [15]:
nx.is_connected(Gs)
Out[15]:
True
In [13]:
print nx.info(Gs)
Name: 
Type: Graph
Number of nodes: 3170
Number of edges: 109058
Average degree:  68.8063
In [12]:
# Ego graph of Gs centred on the node 'Willis__Bruce__I_'.
hub_ego=nx.ego_graph(Gs,'Willis__Bruce__I_')
In [8]:
# Draw the ego network with its centre highlighted.
# The highlighted node is the ego that hub_ego was built around; this name was
# previously used without ever being defined, which raised NameError.
largest_hub='Willis__Bruce__I_'
pos=nx.spring_layout(hub_ego)
nx.draw(hub_ego,pos,node_color='b',node_size=50,with_labels=False)
# Draw ego as large and red
nx.draw_networkx_nodes(hub_ego,pos,nodelist=[largest_hub],node_size=300,node_color='r')
In [48]:
# Draw the labelled ego network on a 20x20 figure.
# NOTE(review): `figure` is not imported in any visible cell; presumably this
# relies on a pylab-style namespace (e.g. %pylab). Confirm before a fresh run.
figure(figsize=(20,20))
nx.draw_networkx(hub_ego)
In [17]:
bc=nx.betweenness_centrality(Gs)
In [51]:
len(bc)
Out[51]:
3170
In [52]:
sorted(bc.items(),key=lambda x:x[1])[-10:]
Out[52]:
[('Brosnan__Pierce', 0.013185193060307594),
 ('Douglas__Michael__I_', 0.0132091694260583),
 ('Bates__Alan', 0.013351189575697819),
 ('Astin__Sean', 0.014019042898398188),
 ('Bloom__Orlando', 0.020403500131657353),
 ('Jackson__Samuel_L_', 0.03274249267938933),
 ('Willis__Bruce__I_', 0.14873139095079452),
 ('Lee__Christopher__I_', 0.15610768423787588),
 ('Depp__Johnny', 0.2065163401410608),
 ('Schwarzenegger__Arnold', 0.2130729583642646)]
In [18]:
for a in alist: print a,len(Gs[a])
Lee__Christopher__I_ 758
Depp__Johnny 969
Willis__Bruce__I_ 806
Schwarzenegger__Arnold 1326
In [19]:
for a in alist: print nx.eccentricity(Gs,a)
3
2
3
3
In [20]:
[(a,b,nx.shortest_path(Gs,a,b)) for a in alist for b in alist if a!=b]
Out[20]:
[('Lee__Christopher__I_',
  'Depp__Johnny',
  ['Lee__Christopher__I_', 'Depp__Johnny']),
 ('Lee__Christopher__I_',
  'Willis__Bruce__I_',
  ['Lee__Christopher__I_', 'Depp__Johnny', 'Willis__Bruce__I_']),
 ('Lee__Christopher__I_',
  'Schwarzenegger__Arnold',
  ['Lee__Christopher__I_', 'Christensen__Hayden', 'Schwarzenegger__Arnold']),
 ('Depp__Johnny',
  'Lee__Christopher__I_',
  ['Depp__Johnny', 'Lee__Christopher__I_']),
 ('Depp__Johnny', 'Willis__Bruce__I_', ['Depp__Johnny', 'Willis__Bruce__I_']),
 ('Depp__Johnny',
  'Schwarzenegger__Arnold',
  ['Depp__Johnny', 'Schwarzenegger__Arnold']),
 ('Willis__Bruce__I_',
  'Lee__Christopher__I_',
  ['Willis__Bruce__I_', 'Depp__Johnny', 'Lee__Christopher__I_']),
 ('Willis__Bruce__I_', 'Depp__Johnny', ['Willis__Bruce__I_', 'Depp__Johnny']),
 ('Willis__Bruce__I_',
  'Schwarzenegger__Arnold',
  ['Willis__Bruce__I_', 'Schwarzenegger__Arnold']),
 ('Schwarzenegger__Arnold',
  'Lee__Christopher__I_',
  ['Schwarzenegger__Arnold', 'Christensen__Hayden', 'Lee__Christopher__I_']),
 ('Schwarzenegger__Arnold',
  'Depp__Johnny',
  ['Schwarzenegger__Arnold', 'Depp__Johnny']),
 ('Schwarzenegger__Arnold',
  'Willis__Bruce__I_',
  ['Schwarzenegger__Arnold', 'Willis__Bruce__I_'])]
In [21]:
nx.eccentricity(Gs,'Christensen__Hayden')
Out[21]:
3
In [67]:
len(Gs['Christensen__Hayden'])
Out[67]:
150
In [68]:
# Count how many adjacency lists each actor appears in across Gs.
# The neighbours are looked up in Gs itself; the earlier version indexed myG
# (a different graph) with Gs's nodes, which counts the wrong edges and raises
# KeyError for any node that is absent from myG.
degs=Counter(nbr for node in Gs for nbr in Gs[node])
In [76]:
nx.eccentricity(Gs,'Grint__Rupert')
Out[76]:
3
In [77]:
nx.diameter(Gs)
Out[77]:
4
In [80]:
# Edge list of a small example graph; each two-letter token is one (u, v) pair.
links=[tuple(pair) for pair in
       'AB AC AD AE BC BF CF DG DH EH FI GI GJ HJ IK JK'.split()]
# Undirected example graph built directly from the edge list.
H=nx.Graph(links)
In [81]:
nx.draw_networkx(H)