우선 나의 Outlook에 있는 이메일에서 보낸 사람, 받는 사람, 보낸 시간, 제목 정보만 파일로 내려받자. Outlook은 기본적인 내보내기 기능을 제공하지만, 이 기능에서는 보낸 시간 필드를 추가할 수 없다. 그래서 어쩔 수 없이 보낸 시간까지 파일로 내보낼 수 있는 서드 파티 툴인 CodeTwo(http://www.codetwo.com/freeware/outlook-export/)를 이용하였다. 무료 버전이 사용하기 편리해서 권장하고 싶기도 하다.
# coding: utf-8
import pandas as pd
import numpy as np
from matplotlib import rcParams
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import networkx as nx
from datetime import datetime
import gensim, logging
import matplotlib.patches as mpatches
%matplotlib inline
# Plot styling: Korean-capable font, base font size, seaborn grid style.
# NOTE(review): the font settings are applied both before and after
# sns.set_style(); presumably set_style() can override rcParams, so the
# second pair is the one that sticks — confirm before removing either pair.
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
sns.set_style("whitegrid")
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
# Load the exported mail CSVs and give both frames the same column names.
# Received mail:
recv_filepath = "/Users/goodvc/Documents/data/goodvc-mail-data/recv-email-2015-03-04.csv"
recv_ds = pd.read_csv(recv_filepath)
recv_ds.columns = ['timestamp', 'from', 'cc', 'to', 'subject']
recv_ds.head(1)
# Sent mail:
sent_filepath = "/Users/goodvc/Documents/data/goodvc-mail-data/sent-email-2015-07-11.csv"
sent_ds = pd.read_csv(sent_filepath)
sent_ds.columns = ['timestamp', 'from', 'cc', 'to', 'subject']
# Preview two rows starting at position 20.
sent_ds[20:].head(2)
timestamp | from | cc | to | subject | |
---|---|---|---|---|---|
20 | 2015-04-29 오후 3:20:33 | 최 규민 | 봉 승태 | 김 종승 | RE: 기술연구소 공채 - 기술연구소 2015년 OJT 강사 |
21 | 2015-04-29 오후 3:09:40 | 최 규민 | 김 종승;고 광표;한 규하;정 진호;봉 승태;김 래선 | 정 진호 | RE: [공유] 스포츠 영상 검색 관련 내용 |
# Convert the timestamp strings to datetime objects.
def str2datetime(s):
    """Parse an exported timestamp such as ``'2015-04-29 오후 3:20:33'``.

    The string is expected to be ``<YYYY-MM-DD> <meridiem> <H:MM:SS>`` where
    the meridiem is the Korean marker 오전 (AM) / 오후 (PM); English
    ``am``/``pm`` (any case) is accepted as well.

    The meridiem is resolved here rather than through strptime's ``%p``
    directive, because ``%p`` matches the AM/PM strings of the active
    locale and therefore only recognises ``am``/``pm`` under a C/English
    LC_TIME setting.

    Args:
        s: timestamp string.

    Returns:
        A naive ``datetime``.

    Raises:
        ValueError: if the string does not have the expected three parts,
            the meridiem is unknown, or the date/time part is malformed.
    """
    date_part, meridiem, clock_part = s.split()
    marker = meridiem.lower()
    if marker in ('오후', 'pm'):
        is_pm = True
    elif marker in ('오전', 'am'):
        is_pm = False
    else:
        raise ValueError('unknown AM/PM marker: %r' % (meridiem,))
    # Without %p, strptime reads %I as an AM hour (12 -> 0), so the parsed
    # hour is always in 0..11 and a PM time is exactly 12 hours later.
    parsed = datetime.strptime(date_part + ' ' + clock_part, '%Y-%m-%d %I:%M:%S')
    if is_pm:
        parsed = parsed.replace(hour=parsed.hour + 12)
    return parsed
# Replace the raw timestamp strings in both frames with parsed datetimes.
recv_ds['timestamp'] = recv_ds['timestamp'].apply( str2datetime )
sent_ds['timestamp'] = sent_ds['timestamp'].apply( str2datetime )
# The timestamp column now has a datetime dtype.
recv_ds.dtypes
timestamp datetime64[ns] from object cc object to object subject object dtype: object
# Derive extra columns from the parsed timestamps.
# 'hour' is the time of day as a fractional hour (hour + minute/60);
# 'date' is the calendar date of the timestamp.
recv_ds['hour'] = recv_ds['timestamp'].apply( lambda x : x.hour+ x.minute/60)
sent_ds['hour'] = sent_ds['timestamp'].apply( lambda x : x.hour+ x.minute/60)
recv_ds['date'] = recv_ds['timestamp'].apply(datetime.date)
sent_ds['date'] = sent_ds['timestamp'].apply(datetime.date)
# Keep only mails strictly before the same cut-off in both frames.
# NOTE(review): the cut-off presumably corresponds to the end of the
# received-mail export (file dated 2015-03-04) — confirm.
sent_ds = sent_ds[sent_ds.timestamp < '2015-03-04 16:08:22' ]
recv_ds = recv_ds[recv_ds.timestamp < '2015-03-04 16:08:22' ]
# Received and sent mails combined into one frame.
mail_ds = pd.concat([recv_ds, sent_ds])
# Load the per-person metadata file (name-info.csv); sample rows below.
""" ### name-info.csv file sample
name,team,color,position,display
안 찬호,사업본부,m,NONE,ach
안 현주,기술연구소,b,NONE,ahj
"""
name_filepath = "/Users/goodvc/Documents/data/goodvc-mail-data/name-info.csv"
name_ds = pd.read_csv(name_filepath)
# Index the frame by person name.
name_ds.index = name_ds['name'].values
# Nested dict used as name_dic[column][name] -> value (see toDisplay).
name_dic = name_ds.to_dict()
name_ds.head(2)
name | team | color | position | display | |
---|---|---|---|---|---|
안 찬호 | 안 찬호 | 사업본부 | m | NONE | ach |
안 현주 | 안 현주 | 기술연구소 | b | NONE | ahj |
# Scatter plot of every mail: x = date, y = time of day (fractional hour);
# received mails in red, sent mails in blue.
plt.figure(figsize=(12,6))
plt.scatter( recv_ds.date.tolist(), recv_ds.hour.tolist(), s=5, alpha=0.5, color='r', label='받은메일' )
plt.scatter( sent_ds.date.tolist(), sent_ds.hour.tolist(), s=5, alpha=0.5, color='b', label='보낸메일' )
plt.ylim(0,24)
plt.ylabel('Hour(0-24)')
plt.title("주고 받은 메일 산점도",fontsize=14)
plt.legend(scatterpoints=3,
loc='lower right',
ncol=3,
fontsize=14, markerscale = 3)
plt.show()
# Line plot of the number of mails per day.
mail_ds.groupby('date').count()['timestamp'].plot(kind='line',figsize=(12,3))
plt.title('일일 메일 건수')
plt.show()
# Per-day counts again, re-indexed 0..N-1 so that 'idx' can serve as a
# numeric x axis (position of the day in the sorted list of dates).
tmp_ds = mail_ds.groupby('date').count()
tmp_ds['date'] = tmp_ds.index
tmp_ds.index = np.arange(0,tmp_ds.shape[0])
tmp_ds['idx'] = np.arange(0,tmp_ds.shape[0])
# NOTE(review): the notebook output shows an empty Figure repr for this
# cell; presumably lmplot draws on its own figure and this one stays
# unused — confirm.
plt.figure(figsize=(8,4))
# Lowess-smoothed trend of daily mail count over elapsed days.
g = sns.lmplot(x="idx", y="timestamp", data=tmp_ds, size=5, y_jitter=0.1, aspect=2,lowess=True )
g.set(ylim=(-.05, 50))
g.set(xlim=(-.05, 802))
g.set_ylabels('메일 송수신건수(건)')
g.set_xlabels('경과일(days)')
plt.show()
<matplotlib.figure.Figure at 0x102712320>
# Number of people on each mail: ';'-separated entries in 'from' plus
# those in 'to' (both fields are stringified before splitting).
mail_ds['users'] = mail_ds.apply(
    lambda rec: len(str(rec['from']).split(';')) + len(str(rec['to']).split(';')),
    axis=1)

# For every day: how many distinct people appeared as sender or recipient,
# and how many mails were exchanged.
daily_rows = []
for day, day_mails in mail_ds.groupby('date'):
    people = set()
    for _, mail in day_mails.iterrows():
        people.add(mail['from'])
        people.update(str(mail['to']).split(';'))
    daily_rows.append([day, len(people), len(day_mails)])
daily_users = pd.DataFrame(daily_rows, columns=['date', 'users', 'messages'])
#fig, ax = plt.subplots()
#ax.plot(daily_users['date'], daily_users['messages'],'ro')
#ax.plot(daily_users['date'], daily_users['users'],'ko')
# Use the row position as a numeric x axis (elapsed days).
daily_users['idx'] = daily_users.index
# NOTE(review): the notebook output shows an empty Figure repr for this
# cell; presumably lmplot creates its own figure — confirm.
plt.figure(figsize=(8,4))
# Lowess-smoothed trend of the daily number of distinct people.
g = sns.lmplot(x="idx", y="users", data=daily_users, size=5, y_jitter=0.1, aspect=2,lowess=True ,color='r')
g.set(ylim=(-.05, 50))
g.set(xlim=(-.05, 802))
g.set_ylabels('일일 유니크 사람수(명)')
g.set_xlabels('경과일(days)')
#ax.plot(daily_users['date'], daily_users['messages'], 'ro')
#ax.plot(x, y2, 'ro')
plt.show()
<matplotlib.figure.Figure at 0x10a0848d0>
# Messages-vs-people regression, one facet per calendar year.
daily_users['year'] = daily_users['date'].apply(lambda x: x.year)
# NOTE(review): no hue variable is passed, so the Yes/No palette looks
# like a leftover from an example and presumably has no effect — confirm.
g = sns.lmplot(x="messages", y="users", data=daily_users, col='year',
palette=dict(Yes="g", No="m"), aspect=.5, x_jitter=.1, y_jitter=.2 )
g.set(ylim=(-.05, 50)).set(xlim=(-.05, 51))
<seaborn.axisgrid.FacetGrid at 0x10a047d68>
## Joint distribution of daily message count and daily people count
g = sns.jointplot(x="messages", y="users", data=daily_users,kind="reg")
## Time-of-day analysis: switch the seaborn style, then re-apply the font
## settings after the style change.
sns.set_style("darkgrid")
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
def distplot(ds, col, title=''):
    """Plot the distribution of ``ds[col]`` on a 0-24 axis with a skew label.

    Args:
        ds: frame holding the values to plot.
        col: name of the column to plot.
        title: figure title.

    The skewness written on the plot is computed only from values strictly
    between 8 and 19, not from the whole column.
    """
    values = ds[col]
    plt.figure(figsize=(8, 4))
    sns.distplot(values, bins=24)
    plt.xlim(0, 24)
    # Restrict the skew statistic to the 8 < value < 19 window.
    in_window = values[(values > 8) & (values < 19)]
    label = "skew = {0}".format(round(in_window.skew(), 4))
    plt.text(1, 0.1, label, fontsize=14)
    plt.title(title, fontdict={'fontsize': 18})
    plt.show()
# Time-of-day distributions for received, sent and all mails.
distplot(recv_ds, 'hour', '받은 E-mail 수신 시간 분포')
distplot(sent_ds, 'hour', '보낸 E-mail 송신 시간 분포')
distplot(mail_ds, 'hour', '전체 E-mail 시간 분포')
즉, 방향성이 있는 network 그래프를 통해 이메일을 주고받는 사람 간의 관계를 찾아볼 수 있을 것 같다.
# network 형태의 데이터를 만들기위해 이메일의 from, to로 부터 edge,node를 만든다.
def toDisplay(name, item='name', default=False):
    """Return attribute ``item`` of person ``name`` from ``name_dic``, upper-cased.

    Args:
        name: key looked up in ``name_dic[item]``.
        item: which column of ``name_dic`` to read.
        default: value used when ``name`` is not present.

    Returns:
        ``False`` when the looked-up (or default) value compares equal to
        ``False``; otherwise the value's ``.upper()``. A non-False default
        is upper-cased as well.
    """
    value = name_dic[item].get(name, default)
    return False if value == False else value.upper()
def makeNetworkInfo(mail_ds):
    """Count sender -> recipient pairs and per-recipient totals.

    Only people that ``toDisplay`` can resolve are counted; mails whose
    sender is unknown, or whose 'to' field is not a string (e.g. NaN), are
    skipped entirely.

    Args:
        mail_ds: frame with 'from' and 'to' columns; 'to' holds
            ';'-separated names.

    Returns:
        ``(edges, nodes)`` where ``edges`` maps ``(sender, recipient)`` to
        the number of mails and ``nodes`` maps each recipient to the number
        of mails received. Senders are not added to ``nodes``.
    """
    edges = defaultdict(int)
    nodes = defaultdict(int)
    for _, row in mail_ds.iterrows():
        sender = toDisplay(row['from'])
        if sender is False:
            continue
        recipients = row['to']
        if not isinstance(recipients, str):  # missing 'to' (NaN)
            continue
        for recipient in recipients.split(';'):
            recipient = toDisplay(recipient)
            if recipient is False:
                continue
            # NOTE(review): the original key was
            # `(f,tt) if f<tt else (f,tt)` — both branches identical, so
            # the key has always been the directed (sender, recipient)
            # pair. That behaviour is kept; if an order-independent key
            # was intended the else branch should be (recipient, sender).
            edges[(sender, recipient)] += 1
            nodes[recipient] += 1
    return (edges, nodes)
# (edges, nodes) tuples for received, sent and all mails.
recv_dic = makeNetworkInfo(recv_ds)
sent_dic = makeNetworkInfo(sent_ds)
mail_dic = makeNetworkInfo(mail_ds)
def drawTops( dic_data, topn=20, title='' ):
    """Bar-plot the ``topn`` entries of ``dic_data`` with the largest counts.

    Args:
        dic_data: mapping of name -> count.
        topn: number of bars to draw (previously ignored; 20 was
            hard-coded).
        title: plot title.
    """
    # Rank in plain Python so the function does not depend on a particular
    # pandas sorting API.
    top_items = sorted(dic_data.items(), key=lambda kv: kv[1], reverse=True)[:topn]
    tmp_ds = pd.DataFrame(top_items, columns=['이름', 'count'])
    # Label the bars with each person's 'display' value.
    tmp_ds.index = tmp_ds['이름'].apply( lambda x: toDisplay(x,item='display') )
    tmp_ds.plot(kind='bar', fontsize=13)
    plt.title(title, fontsize=14)
# Top-20 bar charts built from the node counts of each data set.
drawTops(recv_dic[1],20, "받은 이메일 상위 랭커 20")
drawTops(sent_dic[1],20, "보낸 이메일 상위 랭커 20")
drawTops(mail_dic[1],20, "주고/받는 이메일 상위 랭커 20")
NetworkX 모듈로 Network graph를 그려 보면.. 짜잔
def drawNetwork( nodes, edges, title='', distance=0.8 , legend=None):
    """Draw the mail network as an undirected spring-layout graph.

    Args:
        nodes: mapping of person -> count; sets the relative node size.
        edges: mapping of (person_a, person_b) -> count. Pairs with an
            endpoint missing from ``nodes`` are ignored.
        title: figure title.
        distance: ``k`` parameter passed to ``nx.spring_layout``.
        legend: iterable of (label, color) pairs shown as legend patches.

    Only edges whose total weight is greater than 5 are drawn. Nothing is
    drawn when ``nodes`` is empty.
    """
    legend = [] if legend is None else legend
    if not nodes:
        return

    G = nx.Graph()
    # Register every node up front: a node that has no edge to another
    # node would otherwise be missing from the layout positions and break
    # the node/label drawing below.
    G.add_nodes_from(nodes)
    for (src, dst), count in edges.items():
        if src in nodes and dst in nodes:
            if G.has_edge(src, dst):
                # The graph is undirected, so (a, b) and (b, a) are the
                # same edge: accumulate instead of overwriting the weight.
                G[src][dst]['weight'] += count
            else:
                G.add_edge(src, dst, weight=count)
    edge_list = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 5]

    node_names = list(nodes)
    colors = [toDisplay(u, item='color', default='k') for u in node_names]
    pos = nx.spring_layout(G, k=distance)  # positions for all nodes
    maxSize = max(nodes.values())

    plt.figure(figsize=(12,12))
    nx.draw_networkx_nodes(G, pos,
                           nodelist=node_names,
                           node_color=colors,
                           # node area is proportional to the node's count
                           node_size=[(nodes[n]/maxSize)*4000 for n in node_names],
                           alpha=0.5)
    nx.draw_networkx_edges(G, pos, alpha=0.7, edgelist=edge_list)
    labels = {n: toDisplay(n, item='display', default='unknown') for n in node_names}
    nx.draw_networkx_labels(G, pos, labels, font_size=14, font_color="k")
    plt.title(title, fontsize=14)

    # Legend patches, one per (label, color) pair.
    patches = [mpatches.Patch(color=color, label=name, alpha=0.7)
               for (name, color) in legend]
    plt.legend(handles=patches, fontsize=14)
    plt.show()
# Legend entries: one (team, color) pair per team, using the per-team
# maximum of the 'color' column.
team_rows = name_ds.groupby('team').max()
legend = [(team, info['color']) for team, info in team_rows.iterrows()]

# Network graphs for received, sent and all mails.
drawNetwork(recv_dic[1], recv_dic[0], "받은 메일 Network Graph", 0.4, legend)
drawNetwork(sent_dic[1], sent_dic[0], "보낸 메일 Network Graph", 0.3, legend)
drawNetwork(mail_dic[1], mail_dic[0], "전체 메일 Network Graph", 0.4, legend)
## Group mails into threads by their normalised subject
# Thread key: the subject with the first 'RE: ', 'Fwd: ' and 'FW:' marker
# removed (one occurrence each, case-sensitive) and all spaces dropped.
clean_func = lambda x: str(x).replace('RE: ','',1).replace('Fwd: ','',1).replace('FW:','',1).replace(' ','')
# Tag every mail with its thread key and its direction ('recv' / 'sent').
recv_ds['clean_title'] = recv_ds['subject'].apply( clean_func )
recv_ds['dir'] = 'recv'
sent_ds['clean_title'] = sent_ds['subject'].apply( clean_func )
sent_ds['dir'] = 'sent'
# All mails ordered by thread, then chronologically within the thread.
# NOTE(review): DataFrame.sort() was removed in later pandas releases;
# sort_values() is the replacement there — confirm the target version.
email_ds = pd.concat([recv_ds, sent_ds]).sort(['clean_title','timestamp'])
def whoResponse(pre,nex):
    """Classify the gap between two consecutive mails of one thread.

    Args:
        pre: the earlier mail; must expose ``dir`` and ``timestamp``.
        nex: the later mail; same attributes.

    Returns:
        ``(direction, rtt)`` where ``rtt`` is the gap in seconds, capped at
        seven days, and ``direction`` is

        * ``'recv-time'`` - a sent mail followed by a non-sent mail
          (time until a reply was received),
        * ``'sent-time'`` - a non-sent mail followed by a sent mail
          (time until I replied),
        * ``'other'``     - both mails have the same direction.

    Two consecutive sent mails used to be reported as ``'recv-time'`` even
    though no reply was received; they are now ``'other'``, matching how
    two consecutive received mails were already handled.
    """
    max_rtt = 7 * 24 * 60 * 60  # one week, in seconds
    pre_sent = pre.dir == 'sent'
    nex_sent = nex.dir == 'sent'
    if pre_sent and not nex_sent:
        direction = 'recv-time'
    elif nex_sent and not pre_sent:
        direction = 'sent-time'
    else:
        direction = 'other'
    rtt = (nex.timestamp - pre.timestamp).total_seconds()
    if rtt > max_rtt:
        rtt = max_rtt
    return (direction, rtt)
# Response times in hours, grouped by the direction whoResponse() reports.
rtt_dic = defaultdict(list)
for _, thread in email_ds[:100000].groupby('clean_title'):
    previous = None  # no earlier mail yet in this thread
    for _, current in thread.iterrows():
        if previous is not None:
            direction, seconds = whoResponse(previous, current)
            # Only strictly positive gaps are recorded.
            if seconds > 0:
                rtt_dic[direction].append(seconds / 60 / 60)
        previous = current

# One single-column frame per direction of interest.
r_rtt = pd.DataFrame(rtt_dic['recv-time'], columns=['time'])
s_rtt = pd.DataFrame(rtt_dic['sent-time'], columns=['time'])

# Back to the whitegrid style; font settings are re-applied afterwards.
sns.set_style("whitegrid")
rcParams.update({'font.size': 12})
rcParams['font.family'] = 'NanumGothic'
def drawRTT( ds, title ):
    """Plot the distribution of response times and annotate the median.

    Args:
        ds: response times in hours, as a single-column DataFrame (as
            built for ``s_rtt`` / ``r_rtt``) or a Series.
        title: plot title.
    """
    ax = sns.distplot(ds,bins=160, color='r')
    ax.set_xlim(0,170)
    median = ds.median()
    # For a DataFrame, median() is a Series with one entry per column.
    # Take the scalar explicitly rather than relying on '%.2f' converting
    # a one-element Series to float implicitly.
    if hasattr(median, 'iloc'):
        median = median.iloc[0]
    text = "median = %.2f 시간" % (median)
    ax.text(80, .2, text, fontsize=14)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('회신시간(Hour)')
    plt.show()
# Distribution of my reply times, then of the reply times of others.
drawRTT(s_rtt,'받은메일 나의 회신시간')
drawRTT(r_rtt,'보낸메일 회신 받은 시간')
word2vec으로 대상 추출하기
import re
def extractValues(ds, cols, arr=None, seps='[;]'):
    """Turn every mail into one token list ("sentence") for word2vec.

    For each row the tokens are the values of ``cols`` split on ``seps``,
    followed by the whitespace-separated words of the 'subject' column.

    Args:
        ds: frame containing ``cols`` and a 'subject' column.
        cols: list of column names whose values are split into tokens.
        arr: list to append the token lists to; a new list is created when
            omitted. (The previous mutable default ``[]`` was shared
            between calls and never returned.)
        seps: regular expression used to split the column values.

    Returns:
        ``arr`` (the list passed in, or the newly created one), with one
        token list appended per row of ``ds``.
    """
    if arr is None:
        arr = []
    for _, row in ds.iterrows():
        sub = []
        for val in row[cols].values:
            if pd.isnull(val):  # missing cell (NaN / None)
                continue
            sub.extend(re.split(seps, val))
        sub.extend(str(row['subject']).split())
        arr.append(sub)
    return arr
# Build the word2vec corpus: one token list per mail, received mails
# first, then sent mails, all appended to the same list.
email_arr = []
extractValues(recv_ds, ['to','from'], email_arr)
extractValues(sent_ds, ['from','to'], email_arr)
# Train 100-dimensional vectors; tokens seen fewer than 5 times are dropped.
# NOTE(review): `size=` and `.syn0` are names from older gensim releases;
# newer releases presumably need `vector_size=` and `.wv.vectors` — confirm.
email_model = gensim.models.Word2Vec(email_arr, min_count=5, window=10,size=100 )
email_vt = email_model.syn0
from scipy.cluster.vq import vq, kmeans2, whiten
# Cluster the vectors into 30 groups; cluster[1] is used below as the
# per-vector cluster id.
cluster = kmeans2(email_vt, 30)
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/scipy/cluster/vq.py:600: UserWarning: One of the clusters is empty. Re-run kmean with a different initialization. warnings.warn("One of the clusters is empty. "
def cid2nameDic( c ):
    """Group the model vocabulary by cluster id.

    Args:
        c: sequence of cluster ids; the id at position ``i`` belongs to the
            word ``email_model.index2word[i]``.

    Returns:
        dict mapping each cluster id to the list of its words, in
        vocabulary order.
    """
    clusterDic = {}
    for position, cid in enumerate(c):
        clusterDic.setdefault(cid, []).append(email_model.index2word[position])
    return clusterDic
# Map each cluster id to its member words.
clusters = cid2nameDic( cluster[1] )
# The 20 tokens most similar to one person's name in the embedding space.
email_model.most_similar(positive=['이 윤선'],topn=20)
[('방송_', 0.930067777633667), ('임 지윤', 0.9222908020019531), ('실섭확인', 0.9220837354660034), ('종료', 0.9135299324989319), ('팝업', 0.9120415449142456), ('_', 0.8800953030586243), ('한 태용', 0.877161979675293), ('추천', 0.8609453439712524), ('스토리보드', 0.8379392623901367), ('홍 규희', 0.8009618520736694), ('맞춤방송', 0.7413254976272583), ('AOS', 0.7339004278182983), ('바이너리', 0.7262791991233826), ('문 병준', 0.7183706760406494), ('시안', 0.7115654349327087), ('추천방송', 0.7075629830360413), ('Live', 0.7027546167373657), ('연관', 0.6971281170845032), ('맞춤', 0.6919479370117188), ('관련건', 0.6887381672859192)]