%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import math
## plotting theme: white grid, colorblind palette, NanumGothic font
sns.set(
    style="whitegrid",
    palette="colorblind",
    color_codes=True,
    font_scale=1.4,
    rc={'font.size': 12, 'font.family': 'NanumGothic'},
)
seaborn documentation: https://seaborn.pydata.org/ (previously hosted at https://web.stanford.edu/~mwaskom/software/seaborn/)
import random

## 100 random integers, each drawn from 0..99, as a 1-D numpy array
data = np.array([random.randrange(100) for _ in range(100)])
print(data)
[99 80 46 76 42 95 78 72 27 85 74 96 2 62 85 22 85 64 91 34 62 40 23 80 86 65 42 14 92 99 20 88 47 54 70 28 29 22 50 10 65 14 4 46 52 1 31 89 50 29 41 52 14 82 69 92 34 92 5 19 71 31 13 79 74 30 98 4 49 95 1 69 9 67 80 81 58 9 15 21 28 27 84 71 72 62 71 89 77 83 2 79 21 62 99 3 91 88 62 47]
import seaborn as sns
## box plot of the random samples held in `data`
sns.boxplot(data)
<matplotlib.axes._subplots.AxesSubplot at 0x10a00efd0>
## kernel density estimate of the values in `data`
sns.kdeplot(np.array(data))
<matplotlib.axes._subplots.AxesSubplot at 0x10a0b8630>
## violin plot of the values in `data`
sns.violinplot(data)
<matplotlib.axes._subplots.AxesSubplot at 0x10a289518>
2014, 2015 나의 PC에서 사용하는 프로그램별 사용시간 데이터
!head ./resource/pc_usetime.csv
idx,uid,computername,shortsessionid,longsessionid,filename,title,catecode,usetime,jobclass,timestamp,inserttime,idate 1088,CHOIKYUMIN,CHOIKYUMIN,,,EXCEL.EXE,Microsoft Excel - 추천시스템 기능 기술 세분화.xlsx,6,270000,문서,1391404937,, 1089,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,AfreecaTV Analytics :: Collaboration Search 검색어 - Chrome,2,5000,인터넷,1391404707,, 1090,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,AfreecaTV Analytics :: Hot Boost Search Word - Chrome,2,5000,인터넷,1391404702,, 1091,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,AfreecaTV Analytics :: Hot Broadcast by StarBalloon Cnt - Chrome,2,5000,인터넷,1391404712,, 1092,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,AfreecaTV Analytics :: Starballoon Statistics - Chrome,2,15000,인터넷,1391404827,, 1093,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,AfreecaTV Analytics :: pilot project~ - Chrome,2,40000,인터넷,1391404732,, 1094,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,마이테이블 :: Pearson 상관계수(sample correlation coefficient) - Chrome,2,130000,인터넷,1391405107,, 1095,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,"상대거리 계산 - 피타고라스 정리, 유클리드 거리 공식 : 네이버 블로그 - Chrome",2,30000,인터넷,1391405197,, 1096,CHOIKYUMIN,CHOIKYUMIN,,,chrome.exe,새 탭 - Chrome,2,35000,인터넷,1391405162,,
## Load the per-program usage log.
## DataFrame.from_csv no longer exists in pandas; read_csv is its replacement
## and already defaults to index_col=None (plain integer row index).
total_ds = pd.read_csv("./resource/pc_usetime.csv")
## type casting float to int (missing values are filled with 0 first)
int_columns = ['usetime', 'timestamp']
total_ds[int_columns] = total_ds[int_columns].fillna(0).astype(int)
## peek at two records from row 13000 onward, transposed for readability
total_ds[13000:].head(2).T
13000 | 13001 | |
---|---|---|
idx | 21244 | 21245 |
uid | CHOIKYUMIN | CHOIKYUMIN |
computername | CHOIKYUMIN | CHOIKYUMIN |
shortsessionid | 1394520453.0 | 1394520453.0 |
longsessionid | 1.394511e+09 | 1.394511e+09 |
filename | OUTLOOK.EXE | chrome.exe |
title | 받은 편지함 - goodvc@afreecatv.com - Microsoft Outlook | AfreecaTV Analytics :: Related Word Test Admin... |
catecode | 7 | 2 |
usetime | 20000 | 160000 |
jobclass | 기타업무 | 인터넷 |
timestamp | 1394520603 | 1394520823 |
inserttime | 1.394521e+09 | 1.394521e+09 |
idate | 2014-03-11 15:53:43 | 2014-03-11 15:53:43 |
## Expand the dataset: every usage record becomes one row per 5000-unit step
## of its `usetime`, so a longer usage contributes proportionally more rows.
data_arrays = []
## itertuples yields lightweight namedtuples; iterrows builds a Series per row
for row in total_ds.itertuples(index=False):
    for ut in range(0, int(row.usetime), 5000):
        ## ut is always a multiple of 5000, so the division by 1000 is exact;
        ## the result is added to `timestamp`, which fromtimestamp reads as seconds
        ts = row.timestamp + ut // 1000
        now = datetime.fromtimestamp(ts)
        ## 1-based day of the year
        days = now.timetuple().tm_yday
        data_arrays.append(['total', ts, now, now.year, days, row.filename, row.jobclass])
## build the pandas object
fully_expended_ds = pd.DataFrame(
    data_arrays,
    columns=['total', 'ts', 'time', 'year', 'days', 'filename', 'jobclass'],
)
## Build the main derived fields
time_col = fully_expended_ds['time']
## quarter label, e.g. "14' 1Q"
fully_expended_ds['YYYYQt'] = time_col.apply(lambda x: "%d' %dQ" % (x.year - 2000, x.quarter))
## day trend values: minute of the day (hour * 60 + minute)
fully_expended_ds['day-minute'] = time_col.dt.hour * 60 + time_col.dt.minute
## hour label: upper edge of the 3-hour bucket the record falls in ("3h" .. "24h")
fully_expended_ds['hour'] = time_col.apply(lambda x: "%dh" % (math.ceil((x.hour + 1) / 3) * 3))
## month label
fully_expended_ds['month'] = time_col.dt.month
## week label; index 0 is Monday, matching datetime.weekday()
weekday_str = '월 화 수 목 금 토 일'.split()
fully_expended_ds['weekday'] = time_col.apply(lambda x: weekday_str[x.weekday()])
## app rank: programs ordered by total usetime, rank 1 = largest total.
## Only `usetime` is summed (the other columns are text/ids), and sort_values
## replaces DataFrame.sort, which no longer exists in pandas.
apps_stat = (
    total_ds.groupby('filename')[['usetime']].sum()
    .sort_values('usetime', ascending=False)
    .reset_index()
)
apps_stat['rank'] = range(1, apps_stat.shape[0] + 1)
## attach each program's total usetime and rank to every expanded row
fully_expended_ds = pd.merge(fully_expended_ds, apps_stat, on='filename')
fully_expended_ds.head()
total | ts | time | year | days | filename | jobclass | YYYYQt | day-minute | hour | month | weekday | usetime | rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | total | 1391404937 | 2014-02-03 14:22:17 | 2014 | 34 | EXCEL.EXE | 문서 | 14' 1Q | 862 | 15h | 2 | 월 | 434725000 | 6 |
1 | total | 1391404942 | 2014-02-03 14:22:22 | 2014 | 34 | EXCEL.EXE | 문서 | 14' 1Q | 862 | 15h | 2 | 월 | 434725000 | 6 |
2 | total | 1391404947 | 2014-02-03 14:22:27 | 2014 | 34 | EXCEL.EXE | 문서 | 14' 1Q | 862 | 15h | 2 | 월 | 434725000 | 6 |
3 | total | 1391404952 | 2014-02-03 14:22:32 | 2014 | 34 | EXCEL.EXE | 문서 | 14' 1Q | 862 | 15h | 2 | 월 | 434725000 | 6 |
4 | total | 1391404957 | 2014-02-03 14:22:37 | 2014 | 34 | EXCEL.EXE | 문서 | 14' 1Q | 862 | 15h | 2 | 월 | 434725000 | 6 |
import seaborn as sns

# plot style
sns.set(
    style="whitegrid",
    palette="colorblind",
    font_scale=1.4,
    rc={'font.family': 'NanumGothic'},
)
# day-of-year distribution of all expanded rows, one half-violin per year
sns.violinplot(x='days', y='total', hue='year', split=True, data=fully_expended_ds)
<matplotlib.axes._subplots.AxesSubplot at 0x1317f52e8>
plt.figure(figsize=(12, 8))
sns.set(style="whitegrid", palette="colorblind", color_codes=True, font_scale=1.4,
        rc={'font.size': 12, 'font.family': 'NanumGothic'})
## plotting
g = sns.violinplot(data=fully_expended_ds, x='days', y='total', hue='year',
                   scale="width", orient='h', split=True, cut=2)
## draw x-ticks at the largest day-of-year seen in each month.
## `days` is selected before aggregating so max() is not computed over
## every other column of the frame.
ticks = fully_expended_ds.groupby('month')[['days']].max()
plt.xticks(ticks.days.tolist(), ["%d월" % m for m in ticks.index.tolist()])
plt.xlabel('')
## set x-axis range
plt.xlim(-50, 400)
plt.show()
## 2015 only: number of expanded rows per (quarter, day-of-year)
rows_2015 = fully_expended_ds[fully_expended_ds['year'] == 2015]
tmp_ds = rows_2015.groupby(['YYYYQt', 'days'])[['usetime']].count().reset_index()
## keep only days with more than 12*120 rows
busy_days = tmp_ds['usetime'] > 12 * 120
tmp_ds = tmp_ds[busy_days]
## mean rows per day for each quarter, scaled by 5/60/60
## (presumably 5 seconds per row converted to hours -- confirm against the expansion step)
quarter_mean = tmp_ds.groupby('YYYYQt')[['usetime']].mean()
(quarter_mean * 5 / 60 / 60).T
YYYYQt | 15' 1Q | 15' 2Q | 15' 3Q | 15' 4Q |
---|---|---|---|---|
usetime | 6.987449 | 6.924741 | 6.291538 | 5.447402 |
## day-of-year distribution per program, limited to rows whose rank is below 11
sns.violinplot(data=fully_expended_ds[fully_expended_ds['rank']<11], x='days', y='filename' )
<matplotlib.axes._subplots.AxesSubplot at 0x10b068710>
import seaborn as sns
## Polished violin plotting
def drawViolin(ds, x, y, hue, label=None, figsize=(14,50), order=None, scale='width') :
    """Draw a horizontal, split violin plot of column `x` for each value of `y`.

    Args:
        ds: DataFrame containing the columns named by `x`, `y`, `hue` and `label`.
        x: name of the column plotted along the horizontal axis.
        y: name of the column whose values form the rows of the plot.
        hue: name of the column used to split each violin, or None.
        label: optional column name; for each of its values the largest `x`
            in that group becomes an x tick labelled with the group value.
        figsize: figure size passed to `plt.figure`.
        order: explicit row order; when falsy, rows are ordered by the
            number of `x` values in each `y` group, largest first.
        scale: forwarded to the `scale` argument of `sns.violinplot`.

    Returns:
        None. The plot is drawn on a newly created matplotlib figure.
    """
    sns.set(style="whitegrid", palette="colorblind", font_scale=1.4, rc={'font.family':'NanumGothic'} )
    plt.figure(figsize=figsize)
    ## sort_values replaces DataFrame.sort (no longer in pandas); counting only
    ## column `x` avoids aggregating every column just to order the groups
    order_list = order or ds.groupby(y)[x].count().sort_values(ascending=False).index.tolist()
    ## plotting
    sns.violinplot(data=ds, x=x, y=y, hue=hue, scale=scale, orient='h',
                   cut=2, split=True, inner='box',
                   order=order_list)
    ## repeat the tick labels along the top edge; matplotlib expects a bool here
    plt.tick_params(labeltop=True)
    if label is not None:
        ## x ticks: one per `label` group, placed at that group's largest `x`
        label_max = ds.groupby(label)[x].max()
        plt.xticks(label_max.values.tolist(), label_max.index.tolist(), rotation='vertical')
    plt.xlabel('')
    plt.ylabel('')
## usage over time for programs with rank below 11, ticks labelled by quarter
top_ranked_ds = fully_expended_ds[fully_expended_ds['rank'] < 11]
drawViolin(top_ranked_ds, x='ts', y='filename', hue=None,
           label='YYYYQt', figsize=(14, 20))
## usage over time per job class
drawViolin(fully_expended_ds, x='ts', y='jobclass', hue=None,
           label='YYYYQt', figsize=(14, 20))
## minute-of-day distribution, split by year
drawViolin(fully_expended_ds, x='day-minute', y='total', hue='year',
           label='hour', figsize=(14, 6))
## minute-of-day distribution per weekday (Monday to Friday only), split by year
weekday_str = '월 화 수 목 금'.split()
weekday_ds = fully_expended_ds[fully_expended_ds['weekday'].isin(weekday_str)]
drawViolin(weekday_ds, x='day-minute', y='weekday', hue='year',
           label='hour', figsize=(14, 20), order=weekday_str, scale='count')