import pandas as pd
import numpy as np
from matplotlib import rcParams
from scipy import stats
from matplotlib.pyplot import *
from numpy import sin, exp, absolute, pi, arange
from numpy.random import normal
%matplotlib inline
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
!ls /Users/goodvc/Data/Starwars-Analysis/resource/script/episode2
!head /Users/goodvc/Data/Starwars-Analysis/resource/script/episode2
## script meta information
# One record per film: location of the raw script, title, episode number and
# the line parser that understands that script's layout.
# NOTE: the lineParserN functions are referenced here, so they have to be
# defined before this code runs.
ep1 = {
    'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode1",
    'title': 'THE PHANTOM MENACE',
    'index': 1,
    'parser': lineParser1,
}
ep2 = {
    'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode2",
    'title': 'Attack of the clones',
    'index': 2,
    'parser': lineParser2,
}
ep3 = {
    'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode3",
    'title': 'Revenge of the Sith',
    'index': 3,
    'parser': lineParser3,
}
ep4 = {
    'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode4",
    'title': 'A New Hope',
    'index': 4,
    'parser': lineParser4,
}
ep5 = {
    'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode5",
    'title': 'The Empire Strikes Back',
    'index': 5,
    'parser': lineParser5,
}
ep6 = {
    'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode6",
    'title': 'Return of the Jedi',
    'index': 6,
    'parser': lineParser6,
}
# Episodes in story order; the dataset builder walks this list.
starwars = [ep1, ep2, ep3, ep4, ep5, ep6]
def loadScenario( path ) :
    """Read a script file and return its non-empty lines.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per non-empty line, in file order, with the
        line terminator removed. Any other whitespace is kept, because some
        of the line parsers test how a line starts.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    scenario = []
    # ``with`` closes the handle even if reading fails part-way.
    with open(path, 'r') as f:
        for line in f:
            # Remove only the newline itself: a final line without a
            # terminator must not lose its last character.
            if line.endswith('\n'):
                line = line[:-1]
            if line:
                scenario.append(line)
    return scenario
## episode1
def lineParser1(line, scens_pos=0):
    """Classify one line of a script that writes dialogue as ``NAME: text``.

    Args:
        line: Raw script line.
        scens_pos: Index of the whitespace-separated token expected to hold
            the scene marker (``INT.`` or ``EXT.``).

    Returns:
        ``{'scene': {'type': ..., 'place': ...}}`` for a scene heading,
        ``{'actor': ..., 'wording': ...}`` when the line contains a colon,
        and an empty dict otherwise (including lines with too few tokens).
    """
    # Split off only as many tokens as needed; the remainder stays in one
    # piece and becomes the place description.
    tokens = line.split(maxsplit=scens_pos + 1)
    if len(tokens) <= scens_pos:
        return {}
    marker = tokens[scens_pos]
    if marker in ('INT.', 'EXT.'):
        place = ' '.join(tokens[scens_pos + 1:])
        return {'scene': {'type': marker, 'place': place}}
    # Dialogue: everything before the first colon is the speaker.
    speaker, colon, speech = line.partition(':')
    if colon:
        return {'actor': speaker.strip(), 'wording': speech.strip()}
    return {}
## episode2
def lineParser2(line, word_starts='\t\t', scens_pos=0):
    """Classify one line of a script that puts speaker and speech on separate lines.

    Args:
        line: Raw script line, leading whitespace included.
        word_starts: Prefix that marks a line as spoken text.
        scens_pos: Index of the whitespace-separated token expected to hold
            the scene marker (``INT.`` or ``EXT.``).

    Returns:
        ``{'scene': {'type': ..., 'place': ...}}`` for a scene heading,
        ``{'actor': name}`` for a line with no lowercase letters,
        ``{'wording': text}`` for a line starting with ``word_starts``,
        and an empty dict otherwise (including lines with too few tokens).
    """
    tokens = line.split(maxsplit=scens_pos + 1)
    if len(tokens) <= scens_pos:
        return {}
    # Test the token at ``scens_pos``: it is the one reported as the scene
    # type below. Checking token 0 gave wrong results whenever scens_pos != 0.
    if tokens[scens_pos] in ('INT.', 'EXT.'):
        place = ' '.join(tokens[scens_pos + 1:])
        return {'scene': {'type': tokens[scens_pos], 'place': place}}
    text = line.strip()
    # A line without lowercase letters is taken to be a speaker name.
    if text.upper() == text:
        return {'actor': text}
    if line.startswith(word_starts):
        return {'wording': text}
    return {}
## episode3
def lineParser3(line):
    """Parse one line for the episode 3 record.

    Delegates to ``lineParser1`` with the scene marker expected in the second
    whitespace-separated token.
    """
    scene_token_index = 1
    return lineParser1(line, scens_pos=scene_token_index)
## episode4
def lineParser4(line):
    """Parse one line for the episode 4 record.

    Delegates to ``lineParser2``, treating any line that starts with a single
    space as spoken text.
    """
    dialogue_prefix = ' '
    return lineParser2(line, word_starts=dialogue_prefix)
## episode5
def lineParser5(line):
    """Parse one line for the episode 5 record using ``lineParser2`` defaults."""
    parsed = lineParser2(line)
    return parsed
## episode6
def lineParser6(line, word_starts='\t\t', scens_pos=1):
    """Classify one line of a script whose scene headings use ``INT`` / ``EXT``.

    Args:
        line: Raw script line.
        word_starts: Not used by this parser.
        scens_pos: Index of the whitespace-separated token expected to hold
            the scene marker (no trailing period here).

    Returns:
        ``{'scene': {'type': ..., 'place': ...}}`` for a scene heading,
        ``{'actor': text}`` when the stripped line has no lowercase letters
        (this includes blank lines), otherwise ``{'wording': text}``.
    """
    tokens = line.split(maxsplit=scens_pos + 1)
    if len(tokens) > scens_pos:
        marker = tokens[scens_pos]
        if marker in ('INT', 'EXT'):
            place = ' '.join(tokens[scens_pos + 1:])
            return {'scene': {'type': marker, 'place': place}}
    text = line.strip()
    kind = 'actor' if text == text.upper() else 'wording'
    return {kind: text}
def parseScenario( scenario, lineParser ):
    """Group parsed script lines into scenes.

    Args:
        scenario: Iterable of raw script lines.
        lineParser: Callable mapping a line to a dict that may contain the
            keys ``'scene'``, ``'actor'`` and ``'wording'``.

    Returns:
        A list of scene dicts with keys ``'index'`` (counting from 1),
        ``'type'``, ``'place'`` and ``'script'``. ``'script'`` is a list of
        ``[actor, text]`` pairs; each piece of wording is appended to the
        latest pair's text, preceded by a space.
    """
    scenes = []
    for line in scenario:
        ## line parse
        parsed = lineParser(line)
        ## apply parsed data
        if 'scene' in parsed:
            heading = parsed['scene']
            scenes.append({
                'index': len(scenes) + 1,
                'type': heading['type'],
                'place': heading['place'],
                'script': [],
            })
        if not scenes:
            # Anything before the first scene heading is ignored.
            continue
        dialogue = scenes[-1]['script']
        if 'actor' in parsed:
            dialogue.append([parsed['actor'], ''])
        # Wording without a preceding actor in this scene is dropped.
        if 'wording' in parsed and dialogue:
            dialogue[-1][1] += ' ' + parsed['wording']
    return scenes
# Try the episode 6 parser on its own script; the parser receives a copy of
# the line list.
scenario = loadScenario(ep6['path'])
scenes = parseScenario(list(scenario), lineParser6)
def lowerCheck(name):
    """Keep only the leading all-uppercase words of a speaker label.

    Args:
        name: Raw speaker label.

    Returns:
        ``name`` unchanged when upper-casing it changes nothing; otherwise
        the words that precede the first word containing a lowercase
        character, joined by single spaces (possibly an empty string).
    """
    if name == name.upper():
        return name
    kept = []
    for token in name.split():
        if token != token.upper():
            # First word with lowercase characters ends the name.
            break
        kept.append(token)
    return ' '.join(kept)
def checkName(name):
    """Reduce a raw speaker label to a bare character name.

    Steps: keep the leading uppercase words (``lowerCheck``), then cut the
    result at the first tab, comma, period, ``(``, apostrophe, ``/`` or
    ``:``. If nothing is left the original label is used; a result longer
    than 30 characters becomes ``'UNKNOWN'``.

    Args:
        name: Raw speaker label.

    Returns:
        The cleaned name.
    """
    # NOTE(review): the text kept before a delimiter is not stripped, so an
    # all-caps label such as "NAME (V.O.)" yields "NAME " with a trailing
    # space -- confirm whether that is intended.
    candidate = lowerCheck(name)
    for delimiter in '\t,.(\'/:':
        candidate = candidate.partition(delimiter)[0]
    if not candidate:
        candidate = name
    return 'UNKNOWN' if len(candidate) > 30 else candidate
"""
for name in sw_ds.groupby(['actor']).count().sort(['type'], ascending=True).index.values.tolist():
print ( checkName(name),':',name)
"""
" \nfor name in sw_ds.groupby(['actor']).count().sort(['type'], ascending=True).index.values.tolist():\n print ( checkName(name),':',name)\n"
### Character-name typo / alias correction list
# canonical name -> comma-separated spellings that should be folded into it
fix_list = {
    "ANAKIN (VADER)": "ANAKIN,ANAKN,ANAKNI,ANAKIN,ANAKINN,VADER,DARTH VADER",
    "QUI-GON": "GUI-GON,QUI -GON",
    "EMPEROR": "PALAPATINE,EMPEROR,PALPATINE,DARTH SIDIOUS",
    "CAPT. PANAKA": "CAPTAIN PANAKA",
    "PADME": "PADMÉ,PAMDE,AMIDALA",
    "THREEPIO": "C-3PO",
    "MACE WINDU": "MACE,WINDU,MACE-WINDU",
    "TC-14": "TC14",
    "OBI-WAN": "OBI-WAN,OBI-WAM,BEN",
    "LUKE": "LURE",
    "DARTH SIDIOUS": "DABTH SIDIOUS",
    "A": "(O.S) A",
    "JANGO FETT": "JANGO",
    "DOOKU": "COUNT DOOKU",
}
# Inverted lookup: observed spelling -> canonical name. Empty entries are
# skipped; a spelling listed more than once keeps the last canonical name.
fix_dict = {
    spelling: canonical
    for canonical, spellings in fix_list.items()
    for spelling in spellings.split(',')
    if spelling
}
## Build the Star Wars dataframe
# One row per spoken part: running sequence number, episode, scene number
# within the episode, scene type (first three characters), place, speaker
# (cleaned and alias-corrected) and the spoken text.
dataset = []
seq = 1
for episode in list(starwars):
    episode_idx = episode['index']
    scenario = loadScenario(episode['path'])
    scenes = parseScenario(scenario, episode['parser'])
    for scene in scenes:
        index, scene_type, place = scene['index'], scene['type'], scene['place']
        for speaker, speech in scene['script']:
            name = checkName(speaker)
            actor = fix_dict.get(name, name)
            dataset.append([seq, episode_idx, index, scene_type[:3], place, actor, speech])
            seq += 1
column_names = ['sequence', 'episode', 'scene_no', 'type', 'place', 'actor', 'script']
sw_ds = pd.DataFrame(dataset, columns=column_names)
## Drop actors with two or fewer lines in an episode (currently disabled)
#sw_ds = sw_ds.groupby(['episode','actor']).filter(lambda x: len(x) > 2)
## Word count of each spoken part
sw_ds['words'] = sw_ds['script'].apply(lambda text: len(text.split()))
## Place extraction
### Place-name alias list
# canonical place -> comma-separated names that should be folded into it.
# NOTE: this rebinds ``fix_list`` / ``fix_dict``; the character-name tables
# defined earlier under the same names are replaced from here on.
fix_list = {
    'TATOOINE': 'TATOOINE SEA',
    'FOREST': 'FOREST CLEARING,ENDOR FOREST,FOREST LANDING SITE',
}
# Inverted lookup: observed place name -> canonical place.
fix_dict = {
    alias: canonical
    for canonical, aliases in fix_list.items()
    for alias in aliases.split(',')
    if alias
}
def extractPlace(place):
    """Return the main location named in a scene heading.

    The heading is cut at its first comma or hyphen, surrounding whitespace
    is removed, and the result is mapped through the module-level
    ``fix_dict`` alias table (unknown names pass through unchanged).

    Args:
        place: Place text of a scene heading.

    Returns:
        The canonical main place name.
    """
    head = place
    for separator in (',', '-'):
        head = head.split(separator)[0]
    head = head.strip()
    return fix_dict[head] if head in fix_dict else head
# Main location of every row, derived from the full scene heading text.
sw_ds['place1'] = sw_ds['place'].apply(extractPlace)
## add Totally Scene_no
# ``scene_no`` restarts at 1 in every episode. ``totally_scene_no`` numbers
# scenes continuously across all episodes by adding, for each episode, the
# number of scenes in the episodes before it.
#
# The earlier version shifted the cumulative counts twice (once through a
# stateful helper, once by relabelling ``episode``), leaving no lookup row
# for episode 1. The inner merge then dropped those rows, and the merged
# frame's fresh index no longer lined up with ``sw_ds``.
max_scene_no = sw_ds.groupby('episode')['scene_no'].max().sort_index()
# Exclusive cumulative sum: offset of an episode = scenes in earlier episodes.
max_scene_no = (max_scene_no.cumsum() - max_scene_no).reset_index()
episode_offset = max_scene_no.set_index('episode')['scene_no']
# Look the offset up per row, so index and row order of ``sw_ds`` are untouched.
sw_ds['totally_scene_no'] = sw_ds['scene_no'] + sw_ds['episode'].map(episode_offset)
#draw_ds['scene_no'] = draw_ds.scene_no_x + draw_ds.scene_no_y
sw_ds[['episode','sequence','scene_no', 'place1', 'actor', 'script']][280:290]
episode | sequence | scene_no | place1 | actor | script | |
---|---|---|---|---|---|---|
280 | 1 | 281 | 48 | TATOOINE | CAPT | The Queen wishes it. She is curious about thi... |
281 | 1 | 282 | 48 | TATOOINE | PADME | I've been trained in defense... I can take ca... |
282 | 1 | 283 | 48 | TATOOINE | CAPT | Don't make me go back and tell her you refuse. |
283 | 1 | 284 | 48 | TATOOINE | QUI-GON | I don't have time to argue. But this is not a... |
284 | 1 | 285 | 49 | MOS ESPA | QUI-GON | ...moisture farms for the most part, but also... |
285 | 1 | 286 | 49 | MOS ESPA | PADME | ....like us. JAR JAR is in a constant state o... |
286 | 1 | 287 | 49 | MOS ESPA | JAR JAR | Dissen berry berry bad. (steps in ooze) |
287 | 1 | 288 | 50 | MOS ESPA | QUI-GON | We'll try one of the smaller dealers. |
288 | 1 | 289 | 51 | WATTO'S JUNK SHOP | WATTO | (subtitled) Hi chuba da naga? (What do you wa... |
289 | 1 | 290 | 51 | WATTO'S JUNK SHOP | QUI-GON | I need parts for a J-type 327 Nubian. |
## Main-character filtering
# Share of every episode's spoken parts that belongs to each character.
# ``sort_values`` replaces ``DataFrame.sort`` / ``Series.sort``, which no
# longer exist in pandas.
act_episode_ds = (
    sw_ds.groupby(['episode', 'actor']).agg({'scene_no': len}).reset_index()
    .sort_values(['episode', 'actor'], ascending=False)
    .pivot(index='actor', columns='episode', values='scene_no').fillna(0)
)
episode_sum_ds = act_episode_ds.sum()
act_episode_ds = act_episode_ds / episode_sum_ds
act_episode_ds['all'] = act_episode_ds.sum(axis=1)
# Cumulative share per episode, characters ordered by their overall share.
act_episode_ds.sort_values('all', ascending=False).cumsum()[[1,2,3,4,5,6]].plot(figsize=(12,6))
actor_list = set()
# Per episode, keep the top characters while their cumulative share stays
# below 70%. Columns are selected by label, so this does not depend on
# ``iteritems`` (removed in pandas 2).
for idx in [1, 2, 3, 4, 5, 6]:
    ep = act_episode_ds[idx].sort_values(ascending=False).cumsum()
    actor_list.update(ep[ep < 0.7].index.values)
print( actor_list )
{'JAR JAR', 'THREEPIO', 'OBI-WAN', 'NUTE', 'CAPT', 'PADME', 'YODA', 'ANAKIN (VADER)', 'HAN', 'LANDO', 'LUKE', 'MACE WINDU', 'LEIA', 'DOOKU', 'QUI-GON', 'EMPEROR'}
## Main-place filtering
# Share of every episode's spoken parts that takes place at each location.
# ``sort_values`` replaces ``DataFrame.sort`` / ``Series.sort``, which no
# longer exist in pandas.
place_episode_ds = (
    sw_ds.groupby(['episode', 'place1']).agg({'scene_no': len}).reset_index()
    .sort_values(['episode', 'place1'], ascending=False)
    .pivot(index='place1', columns='episode', values='scene_no').fillna(0)
)
episode_sum_ds = place_episode_ds.sum()
place_episode_ds = place_episode_ds / episode_sum_ds
place_episode_ds['all'] = place_episode_ds.sum(axis=1)
# Cumulative share per episode, places ordered by their overall share.
place_episode_ds.sort_values('all', ascending=False).cumsum()[[1,2,3,4,5,6]].plot(figsize=(12,6))
place_list = set()
# Per episode, keep the top places while their cumulative share stays below
# 60%. Columns are selected by label, so this does not depend on
# ``iteritems`` (removed in pandas 2).
for idx in [1, 2, 3, 4, 5, 6]:
    ep = place_episode_ds[idx].sort_values(ascending=False).cumsum()
    place_list.update(ep[ep < 0.6].index.values)
print(place_list)
{'MILLENNIUM FALCON', 'CLOUD CITY', 'SKIFF', 'DAGOBAH SWAMP', "WATTO'S JUNK SHOP", 'DEATH STAR', 'MUSTAFAR', 'TIPOCA CITY', 'EWOK VILLAGE', 'NABOO SPACECRAFT', 'THEED', "ANAKIN'S HOVEL", 'CORUSCANT', 'SENATE BUILDING', 'FEDERATION BATTLESHIP', 'GEONOSIS', "JABBA'S THRONE ROOM", 'MOS ESPA', 'JEDI TEMPLE', "EMPEROR'S TOWER", 'HOTH', 'TATOOINE', 'TEMPLE OF THE JEDI', 'FOREST', 'HEADQUARTERS FRIGATE'}
## Keep only the main characters and places.
## Everything else is labelled *EXTRA / *ETC PLACE.
is_main_actor = sw_ds['actor'].isin(actor_list)
sw_ds['top_actor'] = sw_ds['actor'].where(is_main_actor, '*EXTRA')
is_main_place = sw_ds['place1'].isin(place_list)
sw_ds['top_place'] = sw_ds['place1'].where(is_main_place, '*ETC PLACE')
# Summary figures: speaking characters, scenes, spoken parts, mean parts per
# character, distinct places.
speaking_actors = sw_ds['actor'].nunique()
print( "출연 배우 수(대사가있는) = ", speaking_actors)
scene_groups = sw_ds.groupby(['scene_no','episode']).count()
print( "총 장면(Scene) 수 = ", len(scene_groups) )
print( "배우의 전체 대사(script)수 = ", len(sw_ds))
parts_per_actor = sw_ds.groupby('actor').count().mean()[0]
print( "배우당 평균 대사(script)수 = ", round(parts_per_actor))
place_groups = sw_ds.groupby(['place1']).count()
print( "총 배경 수 = ", len(place_groups) )
출연 배우 수(대사가있는) = 295 총 장면(Scene) 수 = 872 배우의 전체 대사(script)수 = 5595 배우당 평균 대사(script)수 = 19.0 총 배경 수 = 192
# Per-episode totals: number of scenes, distinct characters, distinct places.
ep_stats = sw_ds.groupby('episode').agg({ 'scene_no':np.max, 'actor':pd.Series.nunique , 'place1':pd.Series.nunique })
# Label by source column rather than by position, so the captions stay
# correct whatever column order ``agg`` returns for a dict.
ep_stats = ep_stats.rename(columns={'scene_no': '장면수(Scene)', 'actor': '등장인물수', 'place1': '배경수'})
ep_stats = ep_stats[['장면수(Scene)', '등장인물수', '배경수']]
ep_stats.plot(kind='bar', subplots=True, figsize=(10,8))
plt.show()
# Interior vs. exterior scenes: distinct scene numbers per episode and type,
# expressed as percentages of the episode's INT + EXT total.
scene_counts = sw_ds.groupby(['episode','type']).agg({'scene_no': pd.Series.nunique })
tmp_ds = scene_counts.reset_index().pivot(index='episode', columns='type', values='scene_no')
int_ext_total = tmp_ds['EXT'] + tmp_ds['INT']
tmp_ds['실내장면'] = tmp_ds['INT']*100 / int_ext_total
tmp_ds['실외장면'] = tmp_ds['EXT']*100 / int_ext_total
tmp_ds[['실내장면','실외장면']].plot(kind='bar', stacked=True, figsize=(10,4))
ylabel('비율(%)')
plt.show()
# Per place: distinct scene numbers, number of spoken parts, and the episode
# in which the place occurs most often (smallest episode on ties).
place_stat = sw_ds.groupby(['place1']).agg( { 'scene_no': pd.Series.nunique, 'script': len, 'episode': lambda x: x.mode().iloc[0] } )
# Label by source column: assigning names by position put the 'Episode'
# caption on the spoken-part count when columns follow the dict order.
place_stat = place_stat.rename(columns={'scene_no': '장면수', 'episode': 'Episode', 'script': '대사수'})
place_stat = place_stat[['장면수', 'Episode', '대사수']]
# NOTE(review): the title says "Top 20" but 30 places are plotted -- confirm
# which one is intended.
place_stat.sort_values(['장면수'], ascending=False).head(30)['장면수']\
    .plot(kind='bar', title = '전체 Episode의 영화배경 Top 20', figsize=(10,4) )
# The 20 places with the most scenes, ordered by their dominant episode.
# This rebinds ``place_list`` (previously the set of main places).
place_list = place_stat.sort_values(['장면수'], ascending=False).head(20).sort_values(['Episode']).index.values.tolist()
# Spoken parts per episode, broken down by main place, as stacked bars.
per_episode_place = sw_ds.groupby(['episode','top_place']).agg(
    { 'scene_no': pd.Series.nunique, 'script': len }
)
place_stat = per_episode_place.reset_index()
place_stat = pd.pivot_table(
    place_stat, index='episode', columns='top_place', values='script', aggfunc=np.sum
).fillna(0)
place_stat.plot(kind='barh', stacked=True, figsize=(15,10) )
plt.show()
# Tick positions: the last sequence number of every episode, with labels.
last_sequence = sw_ds.groupby('episode')['sequence'].max()
x_index = last_sequence.values.tolist()
x_label = [
    'Ep-1', 'Ep-2', 'Ep-3',
    'Ep-4', 'Ep-5', 'Ep-6',
]
def drawVlinePlot(x, t):
    """Draw a one-row "barcode" of positions along the script timeline.

    Every value in ``x`` becomes a thin black vertical line; the median of
    ``x`` is marked with a thicker red line. The x ticks come from the
    module-level ``x_index`` / ``x_label``.

    Args:
        x: Sequence of positions to mark.
        t: Plot title.
    """
    plt.figure(figsize=(10, 1.5))
    vlines(x, [0], 1, alpha=0.7, linewidth=1.5, color='k')
    middle = np.median(x)
    vlines([middle], [0], 1, alpha=1, linewidth=2, color='r')
    plt.xticks(x_index, x_label)
    plt.yticks([])
    title(t)
    show()
# One barcode plot per top place, titled with the place's share of all
# spoken parts.
total = len(sw_ds)
for place in place_list:
    tmp_place_ds = sw_ds[sw_ds.place1==place][['sequence','scene_no']]
    # Tiny random jitter on every position. ``np.random`` is used because
    # the file never imports a ``random`` module.
    jittered = [float(s) + np.random.random()*0.001 for s in tmp_place_ds.sequence]
    # 0 and 6000 are added as fixed end points of every plot.
    x = [0] + jittered + [6000]
    p = "{p} ({n}%)".format(p=place, n=round((len(tmp_place_ds)*100./ total),1))
    drawVlinePlot(x,p)
# Number of whitespace-separated words in every spoken part.
sw_ds['words'] = sw_ds['script'].apply(lambda text: len(text.split()))
import seaborn as sns
def drawTimeline(ds, x, y) :
    """Draw horizontal violins showing where each category occurs on the timeline.

    Args:
        ds: DataFrame holding at least the columns ``x``, ``y`` and
            ``'episode'``.
        x: Name of the numeric timeline column.
        y: Name of the categorical column; one violin per category.
    """
    sns.set(style="whitegrid", palette="pastel", color_codes=True,font_scale=1.4)
    rcParams['font.family'] = 'NanumGothic'
    plt.figure(figsize=(14,10))
    # Categories ordered by the mean episode in which they occur. Computed
    # from ``ds`` (the data being plotted), not from the global frame.
    order = ds.groupby(y)['episode'].mean().sort_values().index.tolist()
    ## plotting
    g = sns.violinplot(data=ds, x=x, y=y, scale="width", orient='h'
                       , cut=1, bw=.03
                       , order=order)
    ## x ticks: last position of every episode
    x_index = ds.groupby('episode')[x].max().values.tolist()
    x_label = ['Ep-1','Ep-2','Ep-3[기준점]','Ep-4','Ep-5','Ep-6']
    plt.xticks(x_index, x_label)
    plt.xlabel('')
    plt.ylabel('')
    plt.xlim(0,5700)
    ## font size restore
    sns.set(style="whitegrid", palette="pastel", color_codes=True, font_scale=1 )
# Timelines of the main characters and of the main places.
for category_column in ('top_actor', 'top_place'):
    drawTimeline(ds=sw_ds, x='sequence', y=category_column)
## dataset for https://my.infocaptor.com/free_data_visualization.php
# Episodes above 3 are labelled as the original trilogy, the rest as the
# prequel trilogy. The episode ranges inside the two labels were swapped
# and are corrected here.
sw_ds['trilogy'] = sw_ds.episode.apply(lambda x : 'Original(EP-4~6) Trilogy' if x>3 else 'Prequel(EP-1~3) Trilogy')
# Number of spoken parts per (trilogy, main place, main character).
tmp_ds = sw_ds.groupby(['trilogy','top_place','top_actor']).count()[['scene_no']].reset_index()
pd.options.display.max_rows=10000
tmp_ds
#pd.options.display.max_rows=15
trilogy | top_place | top_actor | scene_no | |
---|---|---|---|---|
0 | Original(EP-1~3) Trilogy | *ETC PLACE | *EXTRA | 257 |
1 | Original(EP-1~3) Trilogy | *ETC PLACE | ANAKIN (VADER) | 50 |
2 | Original(EP-1~3) Trilogy | *ETC PLACE | EMPEROR | 18 |
3 | Original(EP-1~3) Trilogy | *ETC PLACE | HAN | 129 |
4 | Original(EP-1~3) Trilogy | *ETC PLACE | LANDO | 24 |
5 | Original(EP-1~3) Trilogy | *ETC PLACE | LEIA | 62 |
6 | Original(EP-1~3) Trilogy | *ETC PLACE | LUKE | 217 |
7 | Original(EP-1~3) Trilogy | *ETC PLACE | OBI-WAN | 43 |
8 | Original(EP-1~3) Trilogy | *ETC PLACE | THREEPIO | 124 |
9 | Original(EP-1~3) Trilogy | *ETC PLACE | YODA | 46 |
10 | Original(EP-1~3) Trilogy | CLOUD CITY | *EXTRA | 7 |
11 | Original(EP-1~3) Trilogy | CLOUD CITY | ANAKIN (VADER) | 22 |
12 | Original(EP-1~3) Trilogy | CLOUD CITY | HAN | 28 |
13 | Original(EP-1~3) Trilogy | CLOUD CITY | LANDO | 46 |
14 | Original(EP-1~3) Trilogy | CLOUD CITY | LEIA | 24 |
15 | Original(EP-1~3) Trilogy | CLOUD CITY | LUKE | 2 |
16 | Original(EP-1~3) Trilogy | CLOUD CITY | THREEPIO | 25 |
17 | Original(EP-1~3) Trilogy | DEATH STAR | *EXTRA | 87 |
18 | Original(EP-1~3) Trilogy | DEATH STAR | ANAKIN (VADER) | 36 |
19 | Original(EP-1~3) Trilogy | DEATH STAR | EMPEROR | 13 |
20 | Original(EP-1~3) Trilogy | DEATH STAR | HAN | 59 |
21 | Original(EP-1~3) Trilogy | DEATH STAR | LEIA | 35 |
22 | Original(EP-1~3) Trilogy | DEATH STAR | LUKE | 72 |
23 | Original(EP-1~3) Trilogy | DEATH STAR | OBI-WAN | 7 |
24 | Original(EP-1~3) Trilogy | DEATH STAR | THREEPIO | 25 |
25 | Original(EP-1~3) Trilogy | FOREST | *EXTRA | 12 |
26 | Original(EP-1~3) Trilogy | FOREST | HAN | 26 |
27 | Original(EP-1~3) Trilogy | FOREST | LEIA | 19 |
28 | Original(EP-1~3) Trilogy | FOREST | LUKE | 11 |
29 | Original(EP-1~3) Trilogy | FOREST | THREEPIO | 20 |
30 | Original(EP-1~3) Trilogy | HOTH | *EXTRA | 24 |
31 | Original(EP-1~3) Trilogy | HOTH | HAN | 47 |
32 | Original(EP-1~3) Trilogy | HOTH | LEIA | 13 |
33 | Original(EP-1~3) Trilogy | HOTH | LUKE | 9 |
34 | Original(EP-1~3) Trilogy | HOTH | OBI-WAN | 3 |
35 | Original(EP-1~3) Trilogy | HOTH | THREEPIO | 18 |
36 | Original(EP-1~3) Trilogy | LUKE'S SNOWSPEEDER | *EXTRA | 2 |
37 | Original(EP-1~3) Trilogy | LUKE'S SNOWSPEEDER | LUKE | 16 |
38 | Original(EP-1~3) Trilogy | LUKE'S X | *EXTRA | 12 |
39 | Original(EP-1~3) Trilogy | LUKE'S X | LUKE | 32 |
40 | Original(EP-1~3) Trilogy | LUKE'S X | OBI-WAN | 4 |
41 | Original(EP-1~3) Trilogy | MASSASSI OUTPOST | *EXTRA | 27 |
42 | Original(EP-1~3) Trilogy | MASSASSI OUTPOST | HAN | 7 |
43 | Original(EP-1~3) Trilogy | MASSASSI OUTPOST | LEIA | 2 |
44 | Original(EP-1~3) Trilogy | MASSASSI OUTPOST | LUKE | 9 |
45 | Original(EP-1~3) Trilogy | MASSASSI OUTPOST | THREEPIO | 3 |
46 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | *EXTRA | 20 |
47 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | HAN | 116 |
48 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | LANDO | 26 |
49 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | LEIA | 58 |
50 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | LUKE | 34 |
51 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | OBI-WAN | 21 |
52 | Original(EP-1~3) Trilogy | MILLENNIUM FALCON | THREEPIO | 34 |
53 | Original(EP-1~3) Trilogy | REBEL BASE | *EXTRA | 23 |
54 | Original(EP-1~3) Trilogy | REBEL BASE | HAN | 15 |
55 | Original(EP-1~3) Trilogy | REBEL BASE | LEIA | 13 |
56 | Original(EP-1~3) Trilogy | REBEL BASE | LUKE | 8 |
57 | Original(EP-1~3) Trilogy | REBEL BASE | THREEPIO | 9 |
58 | Original(EP-1~3) Trilogy | RED LEADER'S COCKPIT | *EXTRA | 16 |
59 | Original(EP-1~3) Trilogy | RED LEADER'S COCKPIT | LUKE | 1 |
60 | Original(EP-1~3) Trilogy | TATOOINE | *EXTRA | 54 |
61 | Original(EP-1~3) Trilogy | TATOOINE | HAN | 14 |
62 | Original(EP-1~3) Trilogy | TATOOINE | LUKE | 70 |
63 | Original(EP-1~3) Trilogy | TATOOINE | OBI-WAN | 33 |
64 | Original(EP-1~3) Trilogy | TATOOINE | THREEPIO | 37 |
65 | Original(EP-1~3) Trilogy | VADER'S STAR DESTROYER | *EXTRA | 32 |
66 | Original(EP-1~3) Trilogy | VADER'S STAR DESTROYER | ANAKIN (VADER) | 28 |
67 | Original(EP-1~3) Trilogy | VADER'S STAR DESTROYER | EMPEROR | 5 |
68 | Prequel(EP-4~6) Trilogy | *ETC PLACE | *EXTRA | 328 |
69 | Prequel(EP-4~6) Trilogy | *ETC PLACE | ANAKIN (VADER) | 383 |
70 | Prequel(EP-4~6) Trilogy | *ETC PLACE | CAPT | 21 |
71 | Prequel(EP-4~6) Trilogy | *ETC PLACE | DOOKU | 33 |
72 | Prequel(EP-4~6) Trilogy | *ETC PLACE | EMPEROR | 52 |
73 | Prequel(EP-4~6) Trilogy | *ETC PLACE | JAR JAR | 65 |
74 | Prequel(EP-4~6) Trilogy | *ETC PLACE | MACE WINDU | 50 |
75 | Prequel(EP-4~6) Trilogy | *ETC PLACE | NUTE | 12 |
76 | Prequel(EP-4~6) Trilogy | *ETC PLACE | OBI-WAN | 272 |
77 | Prequel(EP-4~6) Trilogy | *ETC PLACE | PADME | 200 |
78 | Prequel(EP-4~6) Trilogy | *ETC PLACE | QUI-GON | 122 |
79 | Prequel(EP-4~6) Trilogy | *ETC PLACE | THREEPIO | 13 |
80 | Prequel(EP-4~6) Trilogy | *ETC PLACE | YODA | 87 |
81 | Prequel(EP-4~6) Trilogy | CORUSCANT | *EXTRA | 111 |
82 | Prequel(EP-4~6) Trilogy | CORUSCANT | ANAKIN (VADER) | 188 |
83 | Prequel(EP-4~6) Trilogy | CORUSCANT | DOOKU | 2 |
84 | Prequel(EP-4~6) Trilogy | CORUSCANT | EMPEROR | 117 |
85 | Prequel(EP-4~6) Trilogy | CORUSCANT | JAR JAR | 9 |
86 | Prequel(EP-4~6) Trilogy | CORUSCANT | MACE WINDU | 38 |
87 | Prequel(EP-4~6) Trilogy | CORUSCANT | OBI-WAN | 104 |
88 | Prequel(EP-4~6) Trilogy | CORUSCANT | PADME | 123 |
89 | Prequel(EP-4~6) Trilogy | CORUSCANT | QUI-GON | 11 |
90 | Prequel(EP-4~6) Trilogy | CORUSCANT | THREEPIO | 19 |
91 | Prequel(EP-4~6) Trilogy | CORUSCANT | YODA | 31 |
92 | Prequel(EP-4~6) Trilogy | FEDERATION BATTLESHIP | *EXTRA | 25 |
93 | Prequel(EP-4~6) Trilogy | FEDERATION BATTLESHIP | EMPEROR | 6 |
94 | Prequel(EP-4~6) Trilogy | FEDERATION BATTLESHIP | NUTE | 24 |
95 | Prequel(EP-4~6) Trilogy | FEDERATION BATTLESHIP | OBI-WAN | 8 |
96 | Prequel(EP-4~6) Trilogy | FEDERATION BATTLESHIP | PADME | 5 |
97 | Prequel(EP-4~6) Trilogy | FEDERATION BATTLESHIP | QUI-GON | 8 |
98 | Prequel(EP-4~6) Trilogy | MOS ESPA | *EXTRA | 96 |
99 | Prequel(EP-4~6) Trilogy | MOS ESPA | ANAKIN (VADER) | 37 |
100 | Prequel(EP-4~6) Trilogy | MOS ESPA | JAR JAR | 11 |
101 | Prequel(EP-4~6) Trilogy | MOS ESPA | OBI-WAN | 1 |
102 | Prequel(EP-4~6) Trilogy | MOS ESPA | PADME | 16 |
103 | Prequel(EP-4~6) Trilogy | MOS ESPA | QUI-GON | 27 |
104 | Prequel(EP-4~6) Trilogy | MOS ESPA | THREEPIO | 3 |
105 | Prequel(EP-4~6) Trilogy | MUSTAFAR | *EXTRA | 6 |
106 | Prequel(EP-4~6) Trilogy | MUSTAFAR | ANAKIN (VADER) | 29 |
107 | Prequel(EP-4~6) Trilogy | MUSTAFAR | EMPEROR | 2 |
108 | Prequel(EP-4~6) Trilogy | MUSTAFAR | OBI-WAN | 17 |
109 | Prequel(EP-4~6) Trilogy | MUSTAFAR | PADME | 14 |
110 | Prequel(EP-4~6) Trilogy | MUSTAFAR | THREEPIO | 3 |
111 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | *EXTRA | 12 |
112 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | ANAKIN (VADER) | 14 |
113 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | CAPT | 9 |
114 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | JAR JAR | 10 |
115 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | OBI-WAN | 10 |
116 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | PADME | 17 |
117 | Prequel(EP-4~6) Trilogy | NABOO SPACECRAFT | QUI-GON | 18 |
118 | Prequel(EP-4~6) Trilogy | TATOOINE | *EXTRA | 36 |
119 | Prequel(EP-4~6) Trilogy | TATOOINE | ANAKIN (VADER) | 27 |
120 | Prequel(EP-4~6) Trilogy | TATOOINE | CAPT | 6 |
121 | Prequel(EP-4~6) Trilogy | TATOOINE | JAR JAR | 1 |
122 | Prequel(EP-4~6) Trilogy | TATOOINE | OBI-WAN | 3 |
123 | Prequel(EP-4~6) Trilogy | TATOOINE | PADME | 18 |
124 | Prequel(EP-4~6) Trilogy | TATOOINE | QUI-GON | 12 |
125 | Prequel(EP-4~6) Trilogy | TATOOINE | THREEPIO | 19 |
126 | Prequel(EP-4~6) Trilogy | THEED | *EXTRA | 16 |
127 | Prequel(EP-4~6) Trilogy | THEED | ANAKIN (VADER) | 16 |
128 | Prequel(EP-4~6) Trilogy | THEED | CAPT | 2 |
129 | Prequel(EP-4~6) Trilogy | THEED | EMPEROR | 3 |
130 | Prequel(EP-4~6) Trilogy | THEED | JAR JAR | 8 |
131 | Prequel(EP-4~6) Trilogy | THEED | MACE WINDU | 2 |
132 | Prequel(EP-4~6) Trilogy | THEED | NUTE | 5 |
133 | Prequel(EP-4~6) Trilogy | THEED | OBI-WAN | 8 |
134 | Prequel(EP-4~6) Trilogy | THEED | PADME | 10 |
135 | Prequel(EP-4~6) Trilogy | THEED | QUI-GON | 12 |
136 | Prequel(EP-4~6) Trilogy | THEED | YODA | 1 |
137 | Prequel(EP-4~6) Trilogy | TIPOCA CITY | *EXTRA | 45 |
138 | Prequel(EP-4~6) Trilogy | TIPOCA CITY | OBI-WAN | 29 |
139 | Prequel(EP-4~6) Trilogy | UTAPAU | *EXTRA | 38 |
140 | Prequel(EP-4~6) Trilogy | UTAPAU | OBI-WAN | 18 |
from scipy.spatial import distance
def cos_cdist(matrix, vector):
    """
    Compute the cosine distances between each row of matrix and vector.

    Args:
        matrix: 2-D array-like (array, DataFrame or nested list), one
            observation per row.
        vector: 1-D array-like with as many entries as ``matrix`` has columns.

    Returns:
        1-D NumPy array with one cosine distance per row of ``matrix``.
    """
    # ``np.asarray`` lets a pandas Series or a plain list be passed; neither
    # is guaranteed to offer ``reshape`` itself.
    v = np.asarray(vector).reshape(1, -1)
    # Use the imported ``distance`` module; the bare name ``scipy`` is never
    # bound in this file.
    return distance.cdist(matrix, v, 'cosine').reshape(-1)
def matrix_similarity( matrix_ds, dist='cosine'):
    """Pairwise cosine similarity between the rows of a DataFrame.

    Args:
        matrix_ds: DataFrame with one item per row and numeric columns.
        dist: Currently ignored (cosine is always used); kept so existing
            calls remain valid.

    Returns:
        Square DataFrame indexed and labelled by ``matrix_ds``'s index,
        holding ``1 - cosine distance`` for every pair of rows.
    """
    labels = matrix_ds.index.tolist()
    values = matrix_ds.values
    # One vectorised call instead of a Python loop over rows; it also avoids
    # calling ``reshape`` on a pandas Series.
    distances = distance.cdist(values, values, 'cosine')
    return 1 - pd.DataFrame(distances, columns=labels, index=labels)
# Characters compared by the scenes they speak in: rows are main
# characters, columns are scene numbers, cells count spoken parts.
actor_ds = pd.pivot_table(
    sw_ds,
    index='top_actor',
    columns='totally_scene_no',
    values='type',
    aggfunc=len,
    fill_value=0,
)
ds = matrix_similarity(actor_ds)
sns.set(style="whitegrid", palette="pastel", color_codes=True, font_scale=1.2)
cmap = sns.cubehelix_palette(as_cmap=True, rot=-.3, light=1)
ax = sns.clustermap(ds, cmap=cmap, linewidths=.5)
# Places compared by who speaks there: rows are main places, columns are
# characters, cells count spoken parts.
place_ds = pd.pivot_table(
    sw_ds,
    index='top_place',
    columns='actor',
    values='type',
    aggfunc=len,
    fill_value=0,
)
ds = matrix_similarity(place_ds)
cmap = sns.cubehelix_palette(as_cmap=True, rot=.3, light=1)
ax = sns.clustermap(ds, cmap=cmap, linewidths=.5)
def extractNetwork( ds, name1, name2, min_edge=0, min_node=0):
    """Count co-occurrences of two string columns as a weighted graph.

    Every row contributes one edge between ``row[name1]`` and ``row[name2]``
    (the pair is stored in sorted order) and one count to each of the two
    nodes. Rows in which either value starts with ``'*'`` are skipped.

    Args:
        ds: DataFrame holding string columns ``name1`` and ``name2``.
        name1: First column name.
        name2: Second column name.
        min_edge: Keep only edges whose count is greater than this.
        min_node: Keep only nodes whose count is greater than this.

    Returns:
        ``(edges, nodes)``: ``edges`` maps ``(n1, n2)`` to its count, limited
        to edges whose two ends are kept nodes; ``nodes`` maps a node name
        to its count.
    """
    # Local import: the file has no module-level import for a counting dict.
    from collections import Counter

    ## make graph data
    edge_count = Counter()
    node_count = Counter()
    for first, second in zip(ds[name1], ds[name2]):
        if first.startswith('*') or second.startswith('*'):
            continue
        ## sort ascending
        n1, n2 = (first, second) if first < second else (second, first)
        edge_count[(n1, n2)] += 1
        node_count[n1] += 1
        node_count[n2] += 1
    nodes = { n:v for (n,v) in node_count.items() if v>min_node }
    edges = {
        (n1, n2): v
        for ((n1, n2), v) in edge_count.items()
        if v > min_edge and n1 in nodes and n2 in nodes
    }
    return (edges, nodes)
def drawSWNetwork( edges, nodes, title, min_display=0 ) :
    """Draw a weighted co-occurrence network.

    Args:
        edges: Mapping ``(node_a, node_b) -> weight``; line width is
            ``weight / 15``.
        nodes: Mapping ``node -> weight``; marker size is ``weight * 4``.
        title: Plot title.
        min_display: Nodes whose weight is below this get no text label.
    """
    # draw graph
    G = nx.Graph()
    # Register every node first, so each entry of ``nodes`` gets a layout
    # position even when none of its edges is present in ``edges``;
    # otherwise drawing that node fails for lack of a position.
    G.add_nodes_from(nodes)
    for (n1, n2), weight in edges.items():
        G.add_edge(n1, n2, weight=weight)
    pos = nx.spring_layout(G, k=0.3) # positions for all nodes
    plt.figure(3,figsize=(17,12))
    plt.xlim(-.02,1.02)
    plt.ylim(-.05,1.05)
    plt.title(title, fontsize=14)
    nx.draw_networkx_nodes(G, pos,
                           nodelist=list(nodes.keys()),
                           node_color='rgbcmy',
                           node_size=[v*4 for v in nodes.values()],
                           alpha=0.8)
    nx.draw_networkx_edges(G, pos, alpha=0.4, edgelist=list(edges.keys()),
                           width=[n/15 for n in edges.values()])
    # Multi-word names are broken over several lines.
    labels = {
        n: ('' if v < min_display else n.replace(' ', '\n'))
        for (n, v) in nodes.items()
    }
    nx.draw_networkx_labels(G, pos, labels, font_size=12, font_color="k")
    #nx.draw(G, fontsize=10)
# Character/place network over all rows with episode > 0, limited to nodes
# that occur more than 15 times.
(edges, nodes) = extractNetwork(ds=sw_ds[sw_ds.episode>0], name1='actor', name2='place1', min_node=15)
G = nx.Graph()
G.add_weighted_edges_from((a, b, w) for ((a, b), w) in edges.items())
#dc = nx.betweenness_centrality(G)
#dc = nx.degree_centrality(G)
dc = nx.betweenness.betweenness_centrality(G)
import operator
# Ascending sort followed by a reversal, so ties come out in the same order
# as before.
ascending = sorted(dc.items(), key=operator.itemgetter(1))
sorted_x = ascending[::-1]
# The 20 most central nodes.
for n, c in sorted_x[:20]:
    print(n,'\t',round(c,3))
OBI-WAN 0.307 ANAKIN (VADER) 0.247 LUKE 0.117 THREEPIO 0.103 TATOOINE 0.102 PADME 0.092 MOS ESPA 0.06 DEATH STAR 0.053 CORUSCANT 0.045 QUI-GON 0.035 EMPEROR 0.035 MILLENNIUM FALCON 0.031 HAN 0.03 THEED 0.028 LEIA 0.027 YODA 0.024 TIPOCA CITY 0.023 SPACE 0.022 VADER'S STAR DESTROYER 0.02 NABOO EDGE OF SWAMP/ GRASS PLAINS 0.02
# Three drawings of the character/place network: all episodes, episodes
# below 4, and episodes above 3.
for subset in (sw_ds, sw_ds[sw_ds.episode<4], sw_ds[sw_ds.episode>3]):
    (edges, nodes) = extractNetwork(ds=subset, name1='actor', name2='place1', min_node=15)
    drawSWNetwork( edges, nodes, '', min_display=40)