스타워즈 시리즈 영화 대본 분석¶

데이터 랭글링¶

startwars대본의 스크립트를 파싱한다.¶

In [2208]:

import pandas as pd
import numpy as np
from matplotlib import rcParams
from scipy import stats

from matplotlib.pyplot import *
from numpy import sin, exp,  absolute, pi, arange
from numpy.random import normal

%matplotlib inline
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})

In [ ]:

!ls /Users/goodvc/Data/Starwars-Analysis/resource/script/episode2
!head /Users/goodvc/Data/Starwars-Analysis/resource/script/episode2

In [354]:

## script meta information 
ep1 = { 'path': "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode1"
      , 'title' :  'THE PHANTOM MENACE', 'index':1
      , 'parser' : lineParser1}
ep2 = { 'path' : "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode2"
      , 'title' :  'Attack of the clones', 'index':2
      , 'parser' : lineParser2}
ep3 = { 'path' : "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode3"
      , 'title' :  'Revenge of the Sith', 'index':3 
      , 'parser' : lineParser3}
ep4 = { 'path' : "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode4"
      , 'title' :  'A New Hope', 'index':4
      , 'parser' : lineParser4}
ep5 = { 'path' : "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode5"
      , 'title' :  'The Empire Strikes Back', 'index':5
      , 'parser' : lineParser5}
ep6 = { 'path' : "/Users/goodvc/Data/Starwars-Analysis/resource/script/episode6"
      , 'title' :  'Return of the Jedi', 'index':6
      , 'parser' : lineParser6}

starwars = [ep1, ep2, ep3, ep4, ep5, ep6]

In [90]:

def loadScenario( path ) :
    f = open(path, 'r')
    scenario = []
    while True: 
        line = f.readline()
        if not line: break
        if len(line)>1:
            scenario.append(line[:-1])

    f.close()
    return scenario

In [378]:

## episode1
def lineParser1(line, scens_pos=0):
    words = line.split(maxsplit=(scens_pos+1))
    scene_seps = set(['INT.', 'EXT.'])
    if len(words)<(scens_pos+1) :
        return {}        
    
    if words[scens_pos] in scene_seps:
        return {'scene': { 'type' : words[scens_pos], 'place' : ' '.join(words[scens_pos+1:]) }}
    
    words = line.split(sep=':', maxsplit=1)
    if len(words)>1:
        return {'actor':words[0].strip(),'wording':words[1].strip() }    
    return {}

In [379]:

## episode2
def lineParser2(line, word_starts='\t\t', scens_pos=0):
    words = line.split(maxsplit=(scens_pos+1))
    scene_seps = set(['INT.', 'EXT.'])
    if len(words)<(scens_pos+1) :
        return {}        
    
    if words[0] in scene_seps:
        return {'scene': { 'type' : words[scens_pos], 'place' : ' '.join(words[scens_pos+1:]) }}
    
    if line.strip().upper()==line.strip():
        return {'actor':line.strip()}
    
    if line.startswith(word_starts):
        return { 'wording':line.strip() }
    return {}

In [369]:

## episode4 
def lineParser3(line):
    return lineParser1(line, scens_pos=1)

In [370]:

## episode4 
def lineParser4(line):
    return lineParser2(line, word_starts='                   ')

In [371]:

## episode4 
def lineParser5(line):
    return lineParser2(line)

In [373]:

## episode6
def lineParser6(line, word_starts='\t\t', scens_pos=1):
    words = line.split(maxsplit=(scens_pos+1))
    scene_seps = set(['INT', 'EXT'])
      
    if len(words)>=(scens_pos+1) and words[scens_pos] in scene_seps:
        return {'scene': { 'type' : words[scens_pos], 'place' : ' '.join(words[scens_pos+1:]) }}
    
    if line.strip().upper()==line.strip():
        return {'actor':line.strip()}
    
    return { 'wording':line.strip() }

In [1161]:

def parseScenario( scenario, lineParser ):
    scenes, script, wording = [], [], []
    actor = ''
    scene_seps = set(['INT.', 'EXT.'])
    index = 1
    cur = -1
    for line in scenario:
        ## line parse
        parsed = lineParser(line)
        ## affect parsed data 
        if 'scene' in parsed:
            data = parsed['scene']
            scene = { 'index': index, 'type' : data['type'], 'place' : data['place'],'script' : [] }
            scenes.append(scene)
            cur = len(scenes)-1
            index += 1
        
        if cur == -1:
            continue
    
        if 'actor' in parsed:
            scenes[cur]['script'].append([parsed['actor'],''])
        
        if 'wording' in parsed and len(scenes[cur]['script'])>0:
            scenes[cur]['script'][-1][1] = scenes[cur]['script'][-1][1]+' '+parsed['wording']
    return scenes

scenario = loadScenario( ep6['path'] )
scenes = parseScenario(scenario[:], lineParser6 )

In [1639]:

def lowerCheck(name):
    if name.upper()==name:
        return name
    
    name = name.split()
    pos=len(name)
    for n, w in enumerate(name):
        if w.upper() != w:
            pos = n
            break;
    return ' '.join(name[:n])

def checkName(name):
    origin = name
    name = lowerCheck(name)
    for ch in '\t,.(\'/:':
        name = name.split(ch)[0]
    if len(name)<1 :
        name = origin
    if len(name)>30 :
        name = 'UNKNOWN'
    return name
"""    
for name in sw_ds.groupby(['actor']).count().sort(['type'], ascending=True).index.values.tolist():
    print ( checkName(name),':',name)
"""

Out[1639]:

"    \nfor name in sw_ds.groupby(['actor']).count().sort(['type'], ascending=True).index.values.tolist():\n    print ( checkName(name),':',name)\n"

In [2192]:

### 등장인물 이름 오타 수정 리스트 
fix_list = {"ANAKIN (VADER)":"ANAKIN,ANAKN,ANAKNI,ANAKIN,ANAKINN,VADER,DARTH VADER"
 ,"QUI-GON":"GUI-GON,QUI -GON"
 ,"EMPEROR":"PALAPATINE,EMPEROR,PALPATINE,DARTH SIDIOUS"
 ,"CAPT. PANAKA":"CAPTAIN PANAKA"
 ,"PADME":"PADMÉ,PAMDE,AMIDALA"
 ,"THREEPIO":"C-3PO"
 ,"MACE WINDU":"MACE,WINDU,MACE-WINDU"
 ,"TC-14":"TC14"
 ,"OBI-WAN":"OBI-WAN,OBI-WAM,BEN"
 ,"LUKE":"LURE"
 ,"DARTH SIDIOUS":"DABTH SIDIOUS"
 ,"A":"(O.S) A"
 ,"JANGO FETT":"JANGO"
 ,"DOOKU":"COUNT DOOKU"
}
fix_dict = {}
for (origin,missed) in fix_list.items():
    for word in missed.split(','):
        if len(word)>0:
            fix_dict[word]=origin

In [2193]:

## starwars dataframe 만들기 
dataset = []
seq = 1
for episode in starwars[:]:
    scenario = loadScenario( episode['path'] )
    scenes = parseScenario(scenario, episode['parser']  )
    episode_idx = episode['index']
    for scene in scenes:
        index = scene['index']
        scene_type = scene['type']
        place = scene['place']
        for s in scene['script']:
            name = checkName(s[0])
            dataset.append([seq,episode_idx, index, scene_type[:3], place, fix_dict.get(name, name), s[1]])
            seq += 1

sw_ds = pd.DataFrame(dataset, columns=['sequence','episode', 'scene_no','type','place','actor', 'script'])

In [2194]:

## 배우의 대사가 2회 이하 스크립트는 삭제 
#sw_ds = sw_ds.groupby(['episode','actor']).filter(lambda x: len(x) > 2)
## 단어수
sw_ds['words'] = sw_ds.script.apply(lambda x: len(x.split()) )

In [2195]:

## 장소 추출 
### 등장인물 이름 오타 수정 리스트 
fix_list = {'TATOOINE':'TATOOINE SEA',
            'FOREST' : 'FOREST CLEARING,ENDOR FOREST,FOREST LANDING SITE' 
}
fix_dict = {}
for (origin,missed) in fix_list.items():
    for word in missed.split(','):
        if len(word)>0:
            fix_dict[word]=origin
            
def extractPlace(place):
    place = place.replace(',','-')
    place = place.split('-')[0].strip()
    return fix_dict.get(place, place)
sw_ds['place1'] = sw_ds.place.apply(extractPlace)

In [2196]:

## add Totally Scene_no
max_scene_no = sw_ds.groupby(['episode']).max().reset_index()[['episode','scene_no']]
max_scene_no['scene_no'] = max_scene_no['scene_no'].cumsum()

In [2197]:

old=0
def shiftval(v):
    global old
    tmp_old=old;
    old = v
    return tmp_old
max_scene_no['scene_no'] = max_scene_no.scene_no.apply(shiftval)

In [2198]:

max_scene_no['episode'] = max_scene_no.episode.apply( lambda x: x+1 if x<6 else 0  )
max_scene_no.iloc[5] = 0

In [2200]:

tmp_ds = pd.merge( sw_ds, max_scene_no, on='episode', how='inner')
sw_ds['totally_scene_no'] = tmp_ds.scene_no_x + tmp_ds.scene_no_y
#draw_ds['scene_no'] = draw_ds.scene_no_x + draw_ds.scene_no_y

In [2201]:

sw_ds[['episode','sequence','scene_no', 'place1', 'actor', 'script']][280:290]

Out[2201]:

	episode	sequence	scene_no	place1	actor	script
280	1	281	48	TATOOINE	CAPT	The Queen wishes it. She is curious about thi...
281	1	282	48	TATOOINE	PADME	I've been trained in defense... I can take ca...
282	1	283	48	TATOOINE	CAPT	Don't make me go back and tell her you refuse.
283	1	284	48	TATOOINE	QUI-GON	I don't have time to argue. But this is not a...
284	1	285	49	MOS ESPA	QUI-GON	...moisture farms for the most part, but also...
285	1	286	49	MOS ESPA	PADME	....like us. JAR JAR is in a constant state o...
286	1	287	49	MOS ESPA	JAR JAR	Dissen berry berry bad. (steps in ooze)
287	1	288	50	MOS ESPA	QUI-GON	We'll try one of the smaller dealers.
288	1	289	51	WATTO'S JUNK SHOP	WATTO	(subtitled) Hi chuba da naga? (What do you wa...
289	1	290	51	WATTO'S JUNK SHOP	QUI-GON	I need parts for a J-type 327 Nubian.

주요 인물과 주요 장소의 추출¶

데이터 비주얼라이제이션의 적절한 표현(가독성)을 위해 중요한 인물과 배경 장소 중심으로 분석(표현)한다
주요 인물 : 각 Episode별 주요 대사의 70%를 점유하는 인물
주요 지역 : 각 Episode별 영화 배경의 60%를 점유하는 배경

In [2202]:

## 주요인물 필터링 
act_episode_ds = sw_ds.groupby(['episode','actor']).agg({'scene_no':len}).reset_index()\
    .sort(['episode','actor'], ascending=False)\
    .pivot(index='actor', columns='episode', values='scene_no').fillna(0)

episode_sum_ds=act_episode_ds.sum()
act_episode_ds = (act_episode_ds/episode_sum_ds)
act_episode_ds['all'] = act_episode_ds.sum(axis=1)
act_episode_ds.sort('all', ascending=False).cumsum()[[1,2,3,4,5,6]].plot(figsize=(12,6))

actor_list = set()
for (idx, ep) in act_episode_ds[[1,2,3,4,5,6]].iteritems():
    ep = ep.sort(ascending=False,inplace=False).cumsum()
    actor_list.update(ep[ep<0.7].index.values)
print( actor_list )

{'JAR JAR', 'THREEPIO', 'OBI-WAN', 'NUTE', 'CAPT', 'PADME', 'YODA', 'ANAKIN (VADER)', 'HAN', 'LANDO', 'LUKE', 'MACE WINDU', 'LEIA', 'DOOKU', 'QUI-GON', 'EMPEROR'}

In [2203]:

## 주요 배경 필터링 
place_episode_ds = sw_ds.groupby(['episode','place1']).agg({'scene_no':len}).reset_index()\
    .sort(['episode','place1'], ascending=False)\
    .pivot(index='place1', columns='episode', values='scene_no').fillna(0)

episode_sum_ds=place_episode_ds.sum()
place_episode_ds = (place_episode_ds/episode_sum_ds)
place_episode_ds['all'] = place_episode_ds.sum(axis=1)
place_episode_ds.sort('all', ascending=False).cumsum()[[1,2,3,4,5,6]].plot(figsize=(12,6))

place_list = set()
for (idx, ep) in place_episode_ds[[1,2,3,4,5,6]].iteritems():
    ep = ep.sort(ascending=False,inplace=False).cumsum()
    place_list.update(ep[ep<0.6].index.values)
print(place_list)

{'MILLENNIUM FALCON', 'CLOUD CITY', 'SKIFF', 'DAGOBAH SWAMP', "WATTO'S JUNK SHOP", 'DEATH STAR', 'MUSTAFAR', 'TIPOCA CITY', 'EWOK VILLAGE', 'NABOO SPACECRAFT', 'THEED', "ANAKIN'S HOVEL", 'CORUSCANT', 'SENATE BUILDING', 'FEDERATION BATTLESHIP', 'GEONOSIS', "JABBA'S THRONE ROOM", 'MOS ESPA', 'JEDI TEMPLE', "EMPEROR'S TOWER", 'HOTH', 'TATOOINE', 'TEMPLE OF THE JEDI', 'FOREST', 'HEADQUARTERS FRIGATE'}

In [2317]:

## 주요 인물과 장소만 모음 
## 주요 인물과 장소를 제외한 항목은 *EXTRA, *ETC PLACE로 명시한 
sw_ds['top_actor'] = sw_ds.actor.apply(lambda x : x if x in actor_list else '*EXTRA' )
sw_ds['top_place'] = sw_ds.place1.apply(lambda x : x if x in place_list else '*ETC PLACE' )

기본적인 통계 데이터를 보자¶

기본적인 스탯 정보

In [2206]:

print( "출연 배우 수(대사가있는) = ",sw_ds.actor.nunique())
print( "총 장면(Scene) 수 = ",len(sw_ds.groupby(['scene_no','episode']).count()) ) 
print( "배우의 전체 대사(script)수 = ",len(sw_ds))
print( "배우당 평균 대사(script)수 = ",round(sw_ds.groupby('actor').count().mean()[0]))
print( "총 배경 수 = ",len(sw_ds.groupby(['place1']).count()) ) 

출연 배우 수(대사가있는) =  295
총 장면(Scene) 수 =  872
배우의 전체 대사(script)수 =  5595
배우당 평균 대사(script)수 =  19.0
총 배경 수 =  192

episode별 장면수, 등장인물수, 배경수 통계¶

Episode별 Scene수를 보면 최초에 제작된 Episode4(A New Hope) 가 장면수가 478개로 가장많다.

In [2209]:

ep_stats = sw_ds.groupby('episode').agg({ 'scene_no':np.max, 'actor':pd.Series.nunique , 'place1':pd.Series.nunique })
ep_stats.columns = ['장면수(Scene)', '등장인물수', '배경수']
ep_stats.plot(kind='bar', subplots=True, figsize=(10,8))
plt.show()

episode별 INT/EXT Scene 비율¶

In [2210]:

tmp_ds = sw_ds.groupby(['episode','type']).agg({'scene_no': pd.Series.nunique }).reset_index()
tmp_ds = tmp_ds.pivot(index='episode', columns='type', values='scene_no')
tmp_ds['실내장면'] = tmp_ds.INT*100 / (tmp_ds.EXT+tmp_ds.INT)
tmp_ds['실외장면'] = tmp_ds.EXT*100 / (tmp_ds.EXT+tmp_ds.INT)
tmp_ds[['실내장면','실외장면']].plot(kind='bar', stacked=True, figsize=(10,4))
ylabel('비율(%)')
plt.show()

영화가 어떤 곳에서 가장 많이 전개 되었는가? ( Scene Place 통계 )¶

전체 영화 배경

In [2211]:

place_stat = sw_ds.groupby(['place1']).agg( { 'scene_no': pd.Series.nunique, 'script': len, 'episode': lambda x: stats.mode(x)[0][0] } )
place_stat.columns = ['장면수','Episode','대사수']                  
place_stat.sort(['장면수'], ascending=False).head(30)['장면수']\
          .plot(kind='bar', title = '전체 Episode의 영화배경 Top 20', figsize=(10,4) )
place_list = place_stat.sort(['장면수'], ascending=False).head(20).sort(['Episode']).index.values.tolist()

episode별 배경 비율

In [2212]:

place_stat = sw_ds.groupby(['episode','top_place']).agg( { 'scene_no': pd.Series.nunique, 'script': len } ).reset_index()
place_stat = pd.pivot_table( place_stat,index='episode', columns='top_place', values='script', aggfunc=np.sum )\
               .fillna(0)

place_stat.plot(kind='barh', stacked=True, figsize=(15,10) )
plt.show()

배경 Top20의 영화내 분포도¶

In [2213]:

x_index = sw_ds.groupby('episode').max()['sequence'].values.tolist()
x_label = ['Ep-1','Ep-2','Ep-3','Ep-4','Ep-5','Ep-6']

def drawVlinePlot(x, t):
    ax = plt.figure(figsize=(10,1.5))
    vlines(x, [0], 1, alpha=0.7, linewidth=1.5, color='k')
    vlines([np.median(x)], [0], 1, alpha=1, linewidth=2, color='r')
    plt.xticks(x_index, x_label)
    plt.yticks([])
    size = len(x)
    #ylabel('{0}'.format(size))
    title(t)
    show()

total = sw_ds.count()[0]
for place in place_list:
    tmp_place_ds =  sw_ds[sw_ds.place1==place][['sequence','scene_no']]
    x = [0]+tmp_place_ds.sequence.apply(lambda x: x+random.random()*0.001).values.tolist()+[6000]
    p = "{p} ({n}%)".format(p=place, n=round((tmp_place_ds.count()[0]*100./ total),1))
    drawVlinePlot(x,p)

등장 인물별 대사 빈도 추출¶

각 씬별 대사 횟수 분포

In [534]:

sw_ds['words'] = sw_ds.script.apply( lambda x: len(x.split()))

In [2281]:

import seaborn as sns

def drawTimeline(ds, x, y) :
    sns.set(style="whitegrid", palette="pastel", color_codes=True,font_scale=1.4)
    rcParams['font.family'] = 'NanumGothic'
    plt.figure(figsize=(14,10))
    
    ## ploting
    g = sns.violinplot(data=ds, x=x, y=y, scale="width", orient='h'
                       , cut=1, bw=.03
                       , order=sw_ds.groupby(y).mean().sort('episode').index.tolist())
    ## x ticks
    x_index = (ds.groupby('episode').max()[x]).values.tolist()
    x_label = ['Ep-1','Ep-2','Ep-3[기준점]','Ep-4','Ep-5','Ep-6']
    plt.xticks(x_index, x_label)
    plt.xlabel('')
    plt.ylabel('')
    plt.xlim(0,5700)

    ## font size restore
    sns.set(style="whitegrid", palette="pastel", color_codes=True, font_scale=1 )

인물별 타임라인¶

In [2282]:

drawTimeline( ds=sw_ds, x='sequence', y='top_actor')   

영화 배경별 타임라인¶

In [2283]:

drawTimeline( ds=sw_ds, x='sequence', y='top_place')   

Sankey chart(https://my.infocaptor.com/free_data_visualization.php) 에 넣을 데이터 추출¶

아래 결과 데이터를 간단히 데이터 비주얼라이제이션을 해주는 아래 사이트에 넣고 결과 출력

In [2320]:

## dataset for https://my.infocaptor.com/free_data_visualization.php
sw_ds['trilogy'] = sw_ds.episode.apply(lambda x : 'Original(EP-1~3) Trilogy' if x>3 else 'Prequel(EP-4~6) Trilogy')
tmp_ds = sw_ds.groupby(['trilogy','top_place','top_actor']).count()[['scene_no']].reset_index()
pd.options.display.max_rows=10000

tmp_ds
#pd.options.display.max_rows=15

Out[2320]:

	trilogy	top_place	top_actor	scene_no
0	Original(EP-1~3) Trilogy	*ETC PLACE	*EXTRA	257
1	Original(EP-1~3) Trilogy	*ETC PLACE	ANAKIN (VADER)	50
2	Original(EP-1~3) Trilogy	*ETC PLACE	EMPEROR	18
3	Original(EP-1~3) Trilogy	*ETC PLACE	HAN	129
4	Original(EP-1~3) Trilogy	*ETC PLACE	LANDO	24
5	Original(EP-1~3) Trilogy	*ETC PLACE	LEIA	62
6	Original(EP-1~3) Trilogy	*ETC PLACE	LUKE	217
7	Original(EP-1~3) Trilogy	*ETC PLACE	OBI-WAN	43
8	Original(EP-1~3) Trilogy	*ETC PLACE	THREEPIO	124
9	Original(EP-1~3) Trilogy	*ETC PLACE	YODA	46
10	Original(EP-1~3) Trilogy	CLOUD CITY	*EXTRA	7
11	Original(EP-1~3) Trilogy	CLOUD CITY	ANAKIN (VADER)	22
12	Original(EP-1~3) Trilogy	CLOUD CITY	HAN	28
13	Original(EP-1~3) Trilogy	CLOUD CITY	LANDO	46
14	Original(EP-1~3) Trilogy	CLOUD CITY	LEIA	24
15	Original(EP-1~3) Trilogy	CLOUD CITY	LUKE	2
16	Original(EP-1~3) Trilogy	CLOUD CITY	THREEPIO	25
17	Original(EP-1~3) Trilogy	DEATH STAR	*EXTRA	87
18	Original(EP-1~3) Trilogy	DEATH STAR	ANAKIN (VADER)	36
19	Original(EP-1~3) Trilogy	DEATH STAR	EMPEROR	13
20	Original(EP-1~3) Trilogy	DEATH STAR	HAN	59
21	Original(EP-1~3) Trilogy	DEATH STAR	LEIA	35
22	Original(EP-1~3) Trilogy	DEATH STAR	LUKE	72
23	Original(EP-1~3) Trilogy	DEATH STAR	OBI-WAN	7
24	Original(EP-1~3) Trilogy	DEATH STAR	THREEPIO	25
25	Original(EP-1~3) Trilogy	FOREST	*EXTRA	12
26	Original(EP-1~3) Trilogy	FOREST	HAN	26
27	Original(EP-1~3) Trilogy	FOREST	LEIA	19
28	Original(EP-1~3) Trilogy	FOREST	LUKE	11
29	Original(EP-1~3) Trilogy	FOREST	THREEPIO	20
30	Original(EP-1~3) Trilogy	HOTH	*EXTRA	24
31	Original(EP-1~3) Trilogy	HOTH	HAN	47
32	Original(EP-1~3) Trilogy	HOTH	LEIA	13
33	Original(EP-1~3) Trilogy	HOTH	LUKE	9
34	Original(EP-1~3) Trilogy	HOTH	OBI-WAN	3
35	Original(EP-1~3) Trilogy	HOTH	THREEPIO	18
36	Original(EP-1~3) Trilogy	LUKE'S SNOWSPEEDER	*EXTRA	2
37	Original(EP-1~3) Trilogy	LUKE'S SNOWSPEEDER	LUKE	16
38	Original(EP-1~3) Trilogy	LUKE'S X	*EXTRA	12
39	Original(EP-1~3) Trilogy	LUKE'S X	LUKE	32
40	Original(EP-1~3) Trilogy	LUKE'S X	OBI-WAN	4
41	Original(EP-1~3) Trilogy	MASSASSI OUTPOST	*EXTRA	27
42	Original(EP-1~3) Trilogy	MASSASSI OUTPOST	HAN	7
43	Original(EP-1~3) Trilogy	MASSASSI OUTPOST	LEIA	2
44	Original(EP-1~3) Trilogy	MASSASSI OUTPOST	LUKE	9
45	Original(EP-1~3) Trilogy	MASSASSI OUTPOST	THREEPIO	3
46	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	*EXTRA	20
47	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	HAN	116
48	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	LANDO	26
49	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	LEIA	58
50	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	LUKE	34
51	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	OBI-WAN	21
52	Original(EP-1~3) Trilogy	MILLENNIUM FALCON	THREEPIO	34
53	Original(EP-1~3) Trilogy	REBEL BASE	*EXTRA	23
54	Original(EP-1~3) Trilogy	REBEL BASE	HAN	15
55	Original(EP-1~3) Trilogy	REBEL BASE	LEIA	13
56	Original(EP-1~3) Trilogy	REBEL BASE	LUKE	8
57	Original(EP-1~3) Trilogy	REBEL BASE	THREEPIO	9
58	Original(EP-1~3) Trilogy	RED LEADER'S COCKPIT	*EXTRA	16
59	Original(EP-1~3) Trilogy	RED LEADER'S COCKPIT	LUKE	1
60	Original(EP-1~3) Trilogy	TATOOINE	*EXTRA	54
61	Original(EP-1~3) Trilogy	TATOOINE	HAN	14
62	Original(EP-1~3) Trilogy	TATOOINE	LUKE	70
63	Original(EP-1~3) Trilogy	TATOOINE	OBI-WAN	33
64	Original(EP-1~3) Trilogy	TATOOINE	THREEPIO	37
65	Original(EP-1~3) Trilogy	VADER'S STAR DESTROYER	*EXTRA	32
66	Original(EP-1~3) Trilogy	VADER'S STAR DESTROYER	ANAKIN (VADER)	28
67	Original(EP-1~3) Trilogy	VADER'S STAR DESTROYER	EMPEROR	5
68	Prequel(EP-4~6) Trilogy	*ETC PLACE	*EXTRA	328
69	Prequel(EP-4~6) Trilogy	*ETC PLACE	ANAKIN (VADER)	383
70	Prequel(EP-4~6) Trilogy	*ETC PLACE	CAPT	21
71	Prequel(EP-4~6) Trilogy	*ETC PLACE	DOOKU	33
72	Prequel(EP-4~6) Trilogy	*ETC PLACE	EMPEROR	52
73	Prequel(EP-4~6) Trilogy	*ETC PLACE	JAR JAR	65
74	Prequel(EP-4~6) Trilogy	*ETC PLACE	MACE WINDU	50
75	Prequel(EP-4~6) Trilogy	*ETC PLACE	NUTE	12
76	Prequel(EP-4~6) Trilogy	*ETC PLACE	OBI-WAN	272
77	Prequel(EP-4~6) Trilogy	*ETC PLACE	PADME	200
78	Prequel(EP-4~6) Trilogy	*ETC PLACE	QUI-GON	122
79	Prequel(EP-4~6) Trilogy	*ETC PLACE	THREEPIO	13
80	Prequel(EP-4~6) Trilogy	*ETC PLACE	YODA	87
81	Prequel(EP-4~6) Trilogy	CORUSCANT	*EXTRA	111
82	Prequel(EP-4~6) Trilogy	CORUSCANT	ANAKIN (VADER)	188
83	Prequel(EP-4~6) Trilogy	CORUSCANT	DOOKU	2
84	Prequel(EP-4~6) Trilogy	CORUSCANT	EMPEROR	117
85	Prequel(EP-4~6) Trilogy	CORUSCANT	JAR JAR	9
86	Prequel(EP-4~6) Trilogy	CORUSCANT	MACE WINDU	38
87	Prequel(EP-4~6) Trilogy	CORUSCANT	OBI-WAN	104
88	Prequel(EP-4~6) Trilogy	CORUSCANT	PADME	123
89	Prequel(EP-4~6) Trilogy	CORUSCANT	QUI-GON	11
90	Prequel(EP-4~6) Trilogy	CORUSCANT	THREEPIO	19
91	Prequel(EP-4~6) Trilogy	CORUSCANT	YODA	31
92	Prequel(EP-4~6) Trilogy	FEDERATION BATTLESHIP	*EXTRA	25
93	Prequel(EP-4~6) Trilogy	FEDERATION BATTLESHIP	EMPEROR	6
94	Prequel(EP-4~6) Trilogy	FEDERATION BATTLESHIP	NUTE	24
95	Prequel(EP-4~6) Trilogy	FEDERATION BATTLESHIP	OBI-WAN	8
96	Prequel(EP-4~6) Trilogy	FEDERATION BATTLESHIP	PADME	5
97	Prequel(EP-4~6) Trilogy	FEDERATION BATTLESHIP	QUI-GON	8
98	Prequel(EP-4~6) Trilogy	MOS ESPA	*EXTRA	96
99	Prequel(EP-4~6) Trilogy	MOS ESPA	ANAKIN (VADER)	37
100	Prequel(EP-4~6) Trilogy	MOS ESPA	JAR JAR	11
101	Prequel(EP-4~6) Trilogy	MOS ESPA	OBI-WAN	1
102	Prequel(EP-4~6) Trilogy	MOS ESPA	PADME	16
103	Prequel(EP-4~6) Trilogy	MOS ESPA	QUI-GON	27
104	Prequel(EP-4~6) Trilogy	MOS ESPA	THREEPIO	3
105	Prequel(EP-4~6) Trilogy	MUSTAFAR	*EXTRA	6
106	Prequel(EP-4~6) Trilogy	MUSTAFAR	ANAKIN (VADER)	29
107	Prequel(EP-4~6) Trilogy	MUSTAFAR	EMPEROR	2
108	Prequel(EP-4~6) Trilogy	MUSTAFAR	OBI-WAN	17
109	Prequel(EP-4~6) Trilogy	MUSTAFAR	PADME	14
110	Prequel(EP-4~6) Trilogy	MUSTAFAR	THREEPIO	3
111	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	*EXTRA	12
112	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	ANAKIN (VADER)	14
113	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	CAPT	9
114	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	JAR JAR	10
115	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	OBI-WAN	10
116	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	PADME	17
117	Prequel(EP-4~6) Trilogy	NABOO SPACECRAFT	QUI-GON	18
118	Prequel(EP-4~6) Trilogy	TATOOINE	*EXTRA	36
119	Prequel(EP-4~6) Trilogy	TATOOINE	ANAKIN (VADER)	27
120	Prequel(EP-4~6) Trilogy	TATOOINE	CAPT	6
121	Prequel(EP-4~6) Trilogy	TATOOINE	JAR JAR	1
122	Prequel(EP-4~6) Trilogy	TATOOINE	OBI-WAN	3
123	Prequel(EP-4~6) Trilogy	TATOOINE	PADME	18
124	Prequel(EP-4~6) Trilogy	TATOOINE	QUI-GON	12
125	Prequel(EP-4~6) Trilogy	TATOOINE	THREEPIO	19
126	Prequel(EP-4~6) Trilogy	THEED	*EXTRA	16
127	Prequel(EP-4~6) Trilogy	THEED	ANAKIN (VADER)	16
128	Prequel(EP-4~6) Trilogy	THEED	CAPT	2
129	Prequel(EP-4~6) Trilogy	THEED	EMPEROR	3
130	Prequel(EP-4~6) Trilogy	THEED	JAR JAR	8
131	Prequel(EP-4~6) Trilogy	THEED	MACE WINDU	2
132	Prequel(EP-4~6) Trilogy	THEED	NUTE	5
133	Prequel(EP-4~6) Trilogy	THEED	OBI-WAN	8
134	Prequel(EP-4~6) Trilogy	THEED	PADME	10
135	Prequel(EP-4~6) Trilogy	THEED	QUI-GON	12
136	Prequel(EP-4~6) Trilogy	THEED	YODA	1
137	Prequel(EP-4~6) Trilogy	TIPOCA CITY	*EXTRA	45
138	Prequel(EP-4~6) Trilogy	TIPOCA CITY	OBI-WAN	29
139	Prequel(EP-4~6) Trilogy	UTAPAU	*EXTRA	38
140	Prequel(EP-4~6) Trilogy	UTAPAU	OBI-WAN	18

trilogy의 인물, 배경¶

다양한 유사도 Matrix 출력¶

인물별 유사도
장면별 유사도
배경별 유사도

In [1999]:

from scipy.spatial import distance 

def cos_cdist(matrix, vector):
    """
    Compute the cosine distances between each row of matrix and vector.
    """
    v = vector.reshape(1, -1)
    return scipy.spatial.distance.cdist(matrix, v, 'cosine').reshape(-1)

def matrix_similarity( matrix_ds, dist='cosine'):
    
    sim_mat = []
    name = []
    for idx, row in matrix_ds.iterrows():
        name.append(idx)
        sim_mat.append(cos_cdist( matrix_ds, row ).tolist())
    ds = 1-pd.DataFrame(sim_mat, columns=name, index=name)
    return ds

In [2216]:

actor_ds = pd.pivot_table( sw_ds, index='top_actor', columns='totally_scene_no', values='type', aggfunc=len, fill_value=0)
ds = matrix_similarity( actor_ds )
sns.set(style="whitegrid", palette="pastel", color_codes=True,font_scale=1.2)
cmap = sns.cubehelix_palette(as_cmap=True, rot=-.3, light=1)
ax = sns.clustermap(ds, cmap=cmap, linewidths=.5)

In [2217]:

place_ds = pd.pivot_table( sw_ds, index='top_place', columns='actor', values='type', aggfunc=len, fill_value=0)
ds = matrix_similarity(place_ds)
cmap = sns.cubehelix_palette(as_cmap=True, rot=.3, light=1)
ax = sns.clustermap(ds, cmap=cmap, linewidths=.5 )

인물-배경의 network 그래프¶

In [2218]:

def extractNetwork( ds, name1, name2, min_edge=0, min_node=0):
    ## make graph data 
    edges = defaultdict(int)
    nodes = defaultdict(int)
    for (idx, row) in ds.iterrows():

        if row[name1].startswith('*') or row[name2].startswith('*'):
            continue

        ## sort ascending
        n1,n2 = (row[name1], row[name2]) if row[name1] < row[name2] else (row[name2], row[name1])
        edges[(n1,n2)] += 1
        nodes[n1] += 1
        nodes[n2] += 1
        
    nodes = { n:v for (n,v) in nodes.items() if v>min_node }
    edges = { (n1,n2):v for ((n1,n2),v) in edges.items() if v>min_edge and n1 in nodes and n2 in nodes }
    return (edges, nodes)

In [2286]:

def drawSWNetwork( edges, nodes, title, min_display=0 ) :
    # draw graph
    G=nx.Graph()
    for (n,v) in edges.items():
        G.add_edge(n[0],n[1],weight=v)

    pos=nx.spring_layout(G, k=0.3) # positions for all nodes
    plt.figure(3,figsize=(17,12))
    plt.xlim(-.02,1.02)
    plt.ylim(-.05,1.05)
    plt.title(title, fontsize=14)
    ax = nx.draw_networkx_nodes(G, pos,
                       nodelist=nodes.keys(),
                       node_color='rgbcmy',
                       node_size=[v*4 for v in nodes.values()],
                       alpha=0.8)

    nx.draw_networkx_edges(G, pos, alpha=0.4, edgelist=edges.keys(), width=[n/15 for n in edges.values()])
    
    
    labels = {} 
    for (n,v) in nodes.items():
        labels[n] =  '' if v<min_display else n.replace(' ','\n')
    ax = nx.draw_networkx_labels(G, pos, labels, font_size=12, font_color="k")
    #nx.draw(G, fontsize=10)

매개 중심성(betweeness centrality) 측정¶

In [2316]:

(edges, nodes) = extractNetwork(ds=sw_ds[sw_ds.episode>0], name1='actor', name2='place1', min_node=15)
G=nx.Graph()
for (n,v) in edges.items():
    G.add_edge(n[0],n[1],weight=v)

#dc = nx.betweenness_centrality(G)
#dc = nx.degree_centrality(G)
dc = nx.betweenness.betweenness_centrality(G)

import operator
sorted_x = sorted(dc.items(), key=operator.itemgetter(1))
sorted_x.reverse()


for n,c in sorted_x[:20]:
    print(n,'\t',round(c,3))

OBI-WAN 	 0.307
ANAKIN (VADER) 	 0.247
LUKE 	 0.117
THREEPIO 	 0.103
TATOOINE 	 0.102
PADME 	 0.092
MOS ESPA 	 0.06
DEATH STAR 	 0.053
CORUSCANT 	 0.045
QUI-GON 	 0.035
EMPEROR 	 0.035
MILLENNIUM FALCON 	 0.031
HAN 	 0.03
THEED 	 0.028
LEIA 	 0.027
YODA 	 0.024
TIPOCA CITY 	 0.023
SPACE 	 0.022
VADER'S STAR DESTROYER 	 0.02
NABOO EDGE OF SWAMP/ GRASS PLAINS 	 0.02

스타워즈 전체(EP1~6) 인물-배경 네크워크¶

In [2288]:

(edges, nodes) = extractNetwork(ds=sw_ds, name1='actor', name2='place1', min_node=15)
drawSWNetwork( edges, nodes, '', min_display=40)

스타워즈 프리퀄 트롤리지(Ep 1,2,3) 인물,배경 네트워크¶

In [2276]:

(edges, nodes) = extractNetwork(ds=sw_ds[sw_ds.episode<4], name1='actor', name2='place1', min_node=15)
drawSWNetwork( edges, nodes, '', min_display=40)

스타워즈 오리지날 트롤리지(Ep 4,5,6) 인물,배경 네트워크¶

In [2280]:

(edges, nodes) = extractNetwork(ds=sw_ds[sw_ds.episode>3], name1='actor', name2='place1', min_node=15)
drawSWNetwork( edges, nodes, '', min_display=40)

In [ ]: