브런치 작가를 추천합니다.
http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
import json
import sys
import time
import urllib
import urllib.request
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
%matplotlib inline
# brunch data crawling with Selenium
# source reference : http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
def crawlBrunchLink(uid, dir='follower'):
    """Crawl a brunch.co.kr list page and return the linked user ids.

    Parameters
    ----------
    uid : str
        Brunch user id (without the leading ``@``).
    dir : str
        Which list to open: ``'follower'`` or ``'following'``.
        (Parameter name kept for backward compatibility although it
        shadows the ``dir`` builtin.)

    Returns
    -------
    list[str]
        Ids taken from each ``a.link_follow`` href with the leading
        ``/@`` stripped.
    """
    base_url = "https://brunch.co.kr/@{uid}/{dir}".format(uid=uid, dir=dir)
    driver = webdriver.Firefox()
    try:
        driver.get(base_url)
        # The page uses infinite scrolling: scroll to the bottom repeatedly
        # until the page source stops growing for several iterations in a row.
        htmlsize = 0
        keep_cnt = 0
        for _ in range(1, 100):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.1)
            if htmlsize == len(driver.page_source):
                keep_cnt += 1
            else:
                keep_cnt = 0
            htmlsize = len(driver.page_source)
            if keep_cnt > 5:
                break
        html_source = driver.page_source
    finally:
        # Always release the browser, even if scrolling fails midway.
        driver.close()
    ## extract follower / following ids from the captured HTML
    data = html_source.encode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    idlist = []
    for anchor in soup.find_all("a", class_="link_follow"):
        following = anchor.get('href')
        if not following:
            continue
        idlist.append(following[2:])  # strip the leading "/@"
    return idlist
## crawl info for a single writer of interest
## extract Brunch writer info : uid, name, text-count, magazine-count, follower-count, following-count
def extractWriterInfo(uid):
    """Fetch a brunch writer's profile page and return summary counters.

    Parameters
    ----------
    uid : str
        Brunch user id (without the leading ``@``).

    Returns
    -------
    list
        ``[uid, name, *counts]`` where counts are the integer values of the
        page's ``span.num_count`` elements (texts, magazines, followers,
        followings), or ``[]`` when the request fails.
    """
    try:
        response = urllib.request.urlopen("https://brunch.co.kr/@{uid}".format(uid=uid))
    except Exception:
        # BUG FIX: the original evaluated a bare `[]` without `return`,
        # so a failed request fell through and raised NameError on
        # `response` below. Return the empty marker instead.
        return []
    data = response.read().decode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    ## writer display name (fall back to the uid when the element is missing)
    names = soup.find_all("strong", class_="profileUserName")
    name = uid if len(names) < 1 else names[0].getText()
    reserved = [uid, name]
    for counter in soup.find_all("span", class_="num_count"):
        reserved.append(int(counter.getText()))
    return reserved
# Sanity check: profile info for one known writer (output echoed below).
extractWriterInfo('goodvc78')
['goodvc78', '최규민', 2, 0, 106, 7]
# Crawl the follower list of my own brunch account.
my_brunch_id = 'goodvc78'
## my follower crawl
my_follower = crawlBrunchLink(my_brunch_id,dir='follower')
print( "my followers ", len(my_follower) )
my followers 106
## crawl basic profile info for each of my followers
# NOTE(review): loop bodies below lost their indentation in this notebook export — restore before running
basic_info = []
for uid in my_follower:
basic_info.append( extractWriterInfo(uid) )
writer_info_ds = pd.DataFrame(basic_info, columns=['uid','name','texts', 'megazines', 'followers', 'followings' ])
## crawl the follower/following network of every follower (slow: opens a browser per crawlBrunchLink call)
networks = {}
for n in my_follower:
followers = crawlBrunchLink( n, 'follower' )
followings = crawlBrunchLink( n, 'following' )
networks[n] = { 'follower': followers, 'following': followings }
# build the (uid, direction, target) edge data set
data = []
for n, f in networks.items():
for nn in f['following']:
data.append([n,'following',nn])
for nn in f['follower']:
data.append([n,'follower',nn])
net_ds = pd.DataFrame(data, columns=['uid','dir','target'])
net_ds.head()
uid | dir | target | |
---|---|---|---|
0 | cycy2222 | following | goodvc78 |
1 | cycy2222 | following | brunch |
2 | sufeel97 | following | sensee |
3 | sufeel97 | following | gahye |
4 | sufeel97 | following | goodvc78 |
## stats: how many writers each of my followers is following
net_ds[net_ds.dir=='following'].groupby(['uid']).count().describe()[:4]
dir | target | |
---|---|---|
count | 106.000000 | 106.000000 |
mean | 21.933962 | 21.933962 |
std | 33.134723 | 33.134723 |
min | 2.000000 | 2.000000 |
## stats: how many followers each of my followers has
net_ds[net_ds.dir=='follower'].groupby(['uid']).count().describe()[:4]
dir | target | |
---|---|---|
count | 40.000000 | 40.000000 |
mean | 40.375000 | 40.375000 |
std | 140.239847 | 140.239847 |
min | 1.000000 | 1.000000 |
# follower count of the top-3 writers
# NOTE(review): DataFrame.sort() was removed in pandas 0.20 — use sort_values() instead
top3_follower = net_ds[net_ds.dir=='follower'].groupby(['uid']).count().sort('dir', ascending=False)[:3].sum()[0]
print ('상위작가 3명 구독자수',top3_follower )
total_follower = net_ds[net_ds.dir=='follower'].count()[0]
print ('전체작가 구독자수', total_follower)
print ('상위작가 3명의 점유율', top3_follower / total_follower)
상위작가 3명 구독자수 1457 전체작가 구독자수 1615 상위작가 3명의 점유율 0.902167182663
## filtering my-id and official-id ( goodvc78, brunch )
net_ds = net_ds[~net_ds.target.isin(['goodvc78','brunch'])]
## select the top 50 writers by how many of my followers follow them
topn=50
# NOTE(review): DataFrame.sort() was removed in pandas 0.20 — use sort_values() instead
top_writer = net_ds[net_ds.dir=='following'].groupby('target').count().sort('uid', ascending=False)[:topn][['dir']]
top_writer = top_writer.reset_index()
top_writer.columns = ['uid','나의 구독자수']
## crawl brunch profile info for each top writer
# NOTE(review): the loop body below lost its indentation in this notebook export
top_writer_info = []
for uid in top_writer.uid:
top_writer_info.append(extractWriterInfo(uid))
top_writer_info = pd.DataFrame(top_writer_info, columns=['uid','작가명','글 수', '매거진수', '전체 구독자수', '관심작가수' ])
## merge the two tables on uid
top_writer = pd.merge(top_writer, top_writer_info, on=['uid'])
## share my followers make up of each writer's total subscribers (+10 smooths tiny denominators)
top_writer['나의 구독자수 비율'] = top_writer['나의 구독자수'] / (top_writer['전체 구독자수'] + 10)
top_writer['나의 구독자수 비율'] = top_writer['나의 구독자수 비율'].apply(lambda x: round(x,3))
def rankBy(ds, colname, topn=10):
    """Print brunch profile URLs for the top writers by *colname*, and return them.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must contain the columns 'uid', '작가명', '나의 구독자수',
        '전체 구독자수', '나의 구독자수 비율'.
    colname : str
        Column to rank by (descending).
    topn : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        The top-n rows, restricted to the display columns above.
    """
    # BUG FIX: DataFrame.sort() was deprecated in pandas 0.17 and removed in
    # 0.20 — sort_values() is the supported API. Also dropped the unused
    # `value=` kwarg from the format call.
    display_cols = ['uid', '작가명', '나의 구독자수', '전체 구독자수', '나의 구독자수 비율']
    ranked = ds.sort_values(colname, ascending=False)[display_cols][:topn]
    for _, row in ranked.iterrows():
        print("https://brunch.co.kr/@{name}".format(name=row['uid']))
    return ranked
#rankBy( top_writer, '나의 구독자수')
# rank by how many of my followers subscribe to each writer
rankBy( top_writer, '나의 구독자수')
https://brunch.co.kr/@lifidea https://brunch.co.kr/@brunchflgu https://brunch.co.kr/@suyoung https://brunch.co.kr/@yoojs8512 https://brunch.co.kr/@jimmyrim https://brunch.co.kr/@insuk https://brunch.co.kr/@suhanjang https://brunch.co.kr/@haneulalice https://brunch.co.kr/@pelexus https://brunch.co.kr/@sooscape
uid | 작가명 | 나의 구독자수 | 전체 구독자수 | 나의 구독자수 비율 | |
---|---|---|---|---|---|
0 | lifidea | Jin Young Kim | 32 | 734 | 0.043 |
1 | brunchflgu | 조우성 변호사 | 32 | 5008 | 0.006 |
2 | suyoung | 강수영 | 24 | 1221 | 0.019 |
3 | yoojs8512 | 유재석 | 24 | 1433 | 0.017 |
4 | jimmyrim | 임지훈 Jimmy Rim | 23 | 2423 | 0.009 |
5 | insuk | 조인석 chris | 21 | 913 | 0.023 |
6 | suhanjang | 티거 Jang | 20 | 4879 | 0.004 |
7 | haneulalice | Alice in wonderland | 18 | 2432 | 0.007 |
8 | pelexus | 최윤섭 | 17 | 580 | 0.029 |
9 | sooscape | 흔디 | 17 | 1705 | 0.010 |
# rank by the share my followers make up of each writer's total subscribers
rankBy( top_writer, '나의 구독자수 비율')
https://brunch.co.kr/@cojette https://brunch.co.kr/@aidenswmo https://brunch.co.kr/@ajmind https://brunch.co.kr/@manya https://brunch.co.kr/@genie7pe https://brunch.co.kr/@jaeseungmun https://brunch.co.kr/@cloud09 https://brunch.co.kr/@koreajb https://brunch.co.kr/@head77x https://brunch.co.kr/@lifidea
uid | 작가명 | 나의 구독자수 | 전체 구독자수 | 나의 구독자수 비율 | |
---|---|---|---|---|---|
27 | cojette | cojette | 11 | 78 | 0.125 |
40 | aidenswmo | 모상우 | 9 | 66 | 0.118 |
46 | ajmind | ajmind | 8 | 129 | 0.058 |
13 | manya | 마냐 | 15 | 282 | 0.051 |
36 | genie7pe | Chulhyun Cho | 10 | 192 | 0.050 |
35 | jaeseungmun | Jaeseung Mun | 10 | 202 | 0.047 |
47 | cloud09 | 강한별 | 8 | 159 | 0.047 |
45 | koreajb | JB | 9 | 186 | 0.046 |
48 | head77x | Brandon Chung | 8 | 166 | 0.045 |
0 | lifidea | Jin Young Kim | 32 | 734 | 0.043 |
# rank by each writer's total subscriber count
rankBy( top_writer, '전체 구독자수')
https://brunch.co.kr/@brunchflgu https://brunch.co.kr/@suhanjang https://brunch.co.kr/@captaink https://brunch.co.kr/@rothem https://brunch.co.kr/@haneulalice https://brunch.co.kr/@jimmyrim https://brunch.co.kr/@2kija https://brunch.co.kr/@sooscape https://brunch.co.kr/@yoonash https://brunch.co.kr/@yoojs8512
uid | 작가명 | 나의 구독자수 | 전체 구독자수 | 나의 구독자수 비율 | |
---|---|---|---|---|---|
1 | brunchflgu | 조우성 변호사 | 32 | 5008 | 0.006 |
6 | suhanjang | 티거 Jang | 20 | 4879 | 0.004 |
31 | captaink | 캡틴K | 10 | 3697 | 0.003 |
25 | rothem | Rothem | 11 | 2680 | 0.004 |
7 | haneulalice | Alice in wonderland | 18 | 2432 | 0.007 |
4 | jimmyrim | 임지훈 Jimmy Rim | 23 | 2423 | 0.009 |
44 | 2kija | 이기주작가 | 9 | 2118 | 0.004 |
9 | sooscape | 흔디 | 17 | 1705 | 0.010 |
14 | yoonash | yoonash | 15 | 1457 | 0.010 |
3 | yoojs8512 | 유재석 | 24 | 1433 | 0.017 |
조금 약한 개인화 + 은둔 고수의 컨텐츠 발견 기법이 적절히 조합이 되면 근사한 추천 시스템이 될 수 있을 듯하다. 유저의 서비스 라이프 사이클(처음 사용자, 중급, 고급 사용자)
좀더 일반적으로 아주 고퀄의 컨텐츠는 관심사가 아니라도 읽을만한 가치가 된다.
## writer filtering: keep only writers followed by 3+ of my followers
# NOTE(review): DataFrame.sort() was removed in pandas 0.20 — use sort_values() instead
writers = net_ds.groupby(['target']).count().sort('uid', ascending=False)
writers_list = writers[writers.dir>2].index.tolist()
## distribution of follow counts among the retained writers
writers[writers.dir>2].groupby('dir').count().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x10b079a90>
## apply the 3+ follower filter to the edge data set
net_ds = net_ds[net_ds.target.isin(writers_list)]
## keep only the 'following' (writers-I-read) direction
net_ds = net_ds[net_ds.dir=='following']
## each subscriber's following list is assumed related, so link every pair of writers within a list
# NOTE(review): the nested loop bodies below lost their indentation in this notebook export
links = []
for (name, following) in net_ds.groupby('uid'):
list1 = following.target.values
seq = 1
for n1 in list1:
for n2 in list1[seq:]:
links.append([n1,n2])
seq += 1
writer_ds = pd.DataFrame(links,columns=['node1','node2'])
import networkx as nx
from collections import defaultdict
def extractNetwork(ds, name1, name2, min_edge=0, min_node=0):
    """Build an undirected co-occurrence graph from two DataFrame columns.

    Each row contributes one (undirected) edge between the values of
    *name1* and *name2*; ids starting with '*' are skipped. Nodes with a
    total weight <= min_node and edges with a count <= min_edge are
    filtered out, then nodes left without any edge are dropped too.

    Returns
    -------
    tuple(dict, dict)
        ``(edges, nodes)`` — edge weights keyed by the sorted node pair,
        and node weights keyed by node id.
    """
    edge_weight = defaultdict(int)
    node_weight = defaultdict(int)
    for _, row in ds.iterrows():
        a, b = row[name1], row[name2]
        # ids starting with '*' are skipped
        if a.startswith('*') or b.startswith('*'):
            continue
        # canonicalise the pair so (x, y) and (y, x) count as one edge
        if b < a:
            a, b = b, a
        edge_weight[(a, b)] += 1
        node_weight[a] += 1
        node_weight[b] += 1
    nodes = {n: w for n, w in node_weight.items() if w > min_node}
    edges = {
        pair: w
        for pair, w in edge_weight.items()
        if w > min_edge and pair[0] in nodes and pair[1] in nodes
    }
    # drop nodes that lost every edge during filtering
    connected = {n for pair in edges for n in pair}
    nodes = {n: w for n, w in nodes.items() if n in connected}
    return (edges, nodes)
def drawSWNetwork(edges, nodes, title, min_display=0):
    """Draw the writer co-occurrence network with a spring layout.

    edges : dict mapping (node1, node2) -> weight; drives edge width.
    nodes : dict mapping node -> weight; drives node size.
    title : figure title string.
    min_display : nodes with weight below this get no text label.
    """
    sns.set(style="whitegrid", palette="pastel", color_codes=True, font_scale=1.4)
    graph = nx.Graph()
    for (src, dst), weight in edges.items():
        graph.add_edge(src, dst, weight=weight)
    layout = nx.spring_layout(graph, k=0.4)  # positions for all nodes
    plt.figure(3, figsize=(17, 12))
    plt.xlim(-.05, 1.05)
    plt.ylim(-.05, 1.05)
    plt.title(title, fontsize=14)
    nx.draw_networkx_nodes(
        graph, layout,
        nodelist=nodes.keys(),
        node_color='rgbcmy',
        node_size=[w * 2 for w in nodes.values()],
        alpha=0.8)
    nx.draw_networkx_edges(
        graph, layout, alpha=0.4, edgelist=edges.keys(),
        width=[(w / 6) ** 2 for w in edges.values()])
    # label only prominent nodes; break multi-word labels onto separate lines
    labels = {n: ('' if w < min_display else n.replace(' ', '\n'))
              for n, w in nodes.items()}
    nx.draw_networkx_labels(graph, layout, labels, font_size=12, font_color="k")
## network display
# keep only strong structure: edges shared by >5 subscribers, nodes with weight >150
(edges, nodes) = extractNetwork(ds=writer_ds, name1='node1', name2='node2',min_edge=5, min_node=150)
drawSWNetwork( edges, nodes, '', min_display=10)
import operator
def centrality(nodes, edges, top=20, method=nx.betweenness.betweenness_centrality):
    """Print the top-*top* graph nodes ranked by a networkx centrality measure.

    nodes : accepted for call-site symmetry but not used by this function.
    edges : dict mapping (node1, node2) -> weight, used to build the graph.
    top : how many ranked entries to print.
    method : any callable taking a Graph and returning {node: score}.
    """
    graph = nx.Graph()
    for (src, dst), weight in edges.items():
        graph.add_edge(src, dst, weight=weight)
    scores = method(graph)
    # ascending sort followed by reverse, exactly as before (preserves tie order)
    ranking = sorted(scores.items(), key=operator.itemgetter(1))
    ranking.reverse()
    for node, score in ranking[:top]:
        print(round(score, 4), "https://brunch.co.kr/@{id}".format(id=node))
## degree centrality
centrality(nodes, edges, top=30, method = nx.degree_centrality)
0.6667 https://brunch.co.kr/@yoojs8512 0.6333 https://brunch.co.kr/@lifidea 0.6167 https://brunch.co.kr/@suyoung 0.5667 https://brunch.co.kr/@brunchflgu 0.4833 https://brunch.co.kr/@suhanjang 0.4667 https://brunch.co.kr/@jimmyrim 0.3167 https://brunch.co.kr/@promise4u 0.3 https://brunch.co.kr/@mobiinside 0.3 https://brunch.co.kr/@sapu0000 0.2833 https://brunch.co.kr/@haneulalice 0.2667 https://brunch.co.kr/@wjchee 0.2667 https://brunch.co.kr/@insuk 0.2667 https://brunch.co.kr/@breakthrough 0.2667 https://brunch.co.kr/@pelexus 0.25 https://brunch.co.kr/@hmin0606 0.25 https://brunch.co.kr/@manya 0.2333 https://brunch.co.kr/@yoonash 0.2167 https://brunch.co.kr/@jihoonjeong 0.2167 https://brunch.co.kr/@sshong 0.2167 https://brunch.co.kr/@sooscape 0.1833 https://brunch.co.kr/@meanimize 0.1667 https://brunch.co.kr/@jsksoft 0.1667 https://brunch.co.kr/@jaeseungmun 0.15 https://brunch.co.kr/@genie7pe 0.15 https://brunch.co.kr/@borashow 0.1333 https://brunch.co.kr/@yper 0.1333 https://brunch.co.kr/@aboutheman 0.1333 https://brunch.co.kr/@sclplus 0.1167 https://brunch.co.kr/@yunjungseo 0.1 https://brunch.co.kr/@techsuda
## betweenness centrality
centrality(nodes, edges, top=30, method = nx.betweenness_centrality)
0.2459 https://brunch.co.kr/@lifidea 0.1909 https://brunch.co.kr/@yoojs8512 0.1715 https://brunch.co.kr/@suyoung 0.1615 https://brunch.co.kr/@brunchflgu 0.0448 https://brunch.co.kr/@jimmyrim 0.0448 https://brunch.co.kr/@suhanjang 0.0385 https://brunch.co.kr/@pelexus 0.0366 https://brunch.co.kr/@breakthrough 0.0247 https://brunch.co.kr/@insuk 0.0162 https://brunch.co.kr/@promise4u 0.0113 https://brunch.co.kr/@sapu0000 0.0105 https://brunch.co.kr/@hmin0606 0.0078 https://brunch.co.kr/@haneulalice 0.0076 https://brunch.co.kr/@mobiinside 0.0057 https://brunch.co.kr/@sshong 0.005 https://brunch.co.kr/@sooscape 0.0049 https://brunch.co.kr/@manya 0.0046 https://brunch.co.kr/@yoonash 0.0039 https://brunch.co.kr/@wjchee 0.0036 https://brunch.co.kr/@meanimize 0.0023 https://brunch.co.kr/@jihoonjeong 0.002 https://brunch.co.kr/@cloud09 0.001 https://brunch.co.kr/@yunjungseo 0.001 https://brunch.co.kr/@jaeseungmun 0.0005 https://brunch.co.kr/@genie7pe 0.0004 https://brunch.co.kr/@jsksoft 0.0003 https://brunch.co.kr/@speedlyh 0.0003 https://brunch.co.kr/@sophie89 0.0002 https://brunch.co.kr/@aboutheman 0.0002 https://brunch.co.kr/@rothem
## edge betweenness centrality
# NOTE(review): edge_betweenness_centrality returns scores keyed by (node, node)
# tuples, so the printed URLs below embed tuples and are malformed.
centrality(nodes, edges, top=30, method = nx.edge_betweenness_centrality)
0.0328 https://brunch.co.kr/@('lifidea', 'aidenswmo') 0.0328 https://brunch.co.kr/@('ajmind', 'breakthrough') 0.0328 https://brunch.co.kr/@('lifidea', 'cojette') 0.0328 https://brunch.co.kr/@('yoojs8512', 'jigjang') 0.0328 https://brunch.co.kr/@('pelexus', 'marsnine') 0.0328 https://brunch.co.kr/@('iamquadr', 'suyoung') 0.0328 https://brunch.co.kr/@('brunchflgu', 'hstyle84') 0.0328 https://brunch.co.kr/@('wanleehani', 'brunchflgu') 0.0328 https://brunch.co.kr/@('lifidea', 'madvirus') 0.0328 https://brunch.co.kr/@('lifidea', 'imagineer') 0.0328 https://brunch.co.kr/@('futureagent', 'brunchflgu') 0.0328 https://brunch.co.kr/@('jihere1001', 'suyoung') 0.0303 https://brunch.co.kr/@('yoojs8512', 'youngwungkim') 0.023 https://brunch.co.kr/@('lifidea', 'cleancode') 0.023 https://brunch.co.kr/@('lifidea', 'bwcho75') 0.0225 https://brunch.co.kr/@('lifidea', 'yoojs8512') 0.0216 https://brunch.co.kr/@('lifidea', 'brunchflgu') 0.0197 https://brunch.co.kr/@('yoojs8512', '2kija') 0.0186 https://brunch.co.kr/@('lifidea', 'suyoung') 0.0175 https://brunch.co.kr/@('suyoung', 'brunchflgu') 0.0172 https://brunch.co.kr/@('lifidea', 'pxdstory') 0.0156 https://brunch.co.kr/@('pxdstory', 'suyoung') 0.0153 https://brunch.co.kr/@('suyoung', 'rothem') 0.0152 https://brunch.co.kr/@('yoojs8512', 'cloud09') 0.015 https://brunch.co.kr/@('speedlyh', 'yoojs8512') 0.0146 https://brunch.co.kr/@('brunch4nrs', 'lifidea') 0.0146 https://brunch.co.kr/@('pelexus', 'lifidea') 0.0142 https://brunch.co.kr/@('yoojs8512', 'brunchflgu') 0.014 https://brunch.co.kr/@('cloud09', 'suyoung') 0.0137 https://brunch.co.kr/@('lifidea', 'breakthrough')
데이터, 디자인, 코딩, IT, 시각화, 스타트업, work&life,
from scipy.spatial import distance
from scipy import stats
import seaborn as sns
import scipy
def cos_cdist(matrix, vector):
    """
    Compute the cosine distances between each row of matrix and vector.
    """
    row_vector = vector.reshape(1, -1)
    distances = distance.cdist(matrix, row_vector, metric='cosine')
    return distances.reshape(-1)
def matrix_similarity(matrix_ds, dist='cosine'):
    """Return the row-by-row cosine similarity matrix of *matrix_ds*.

    matrix_ds : pandas.DataFrame whose rows are compared pairwise.
    dist : accepted for interface compatibility; only cosine is computed
        here (the value is not consulted).

    Returns a square DataFrame indexed and labelled by matrix_ds's index,
    where similarity = 1 - cosine distance.
    """
    labels = []
    rows = []
    for label, row in matrix_ds.iterrows():
        labels.append(label)
        rows.append(cos_cdist(matrix_ds, row).tolist())
    # flip distance into similarity
    return 1 - pd.DataFrame(rows, columns=labels, index=labels)
# restrict to the 30 most-followed writers and pivot into a writer x subscriber matrix
tmp_ds = net_ds[net_ds.target.isin(top_writer[:30].uid)]
writer_ds = pd.pivot_table( tmp_ds, index='target', columns='uid', values='dir', aggfunc=len, fill_value=0)
# pairwise cosine similarity between writers, drawn as a clustered heatmap
ds = matrix_similarity( writer_ds )
sns.set(style="whitegrid", palette="pastel", color_codes=True,font_scale=1.2)
cmap = sns.cubehelix_palette(as_cmap=True, rot=-.2, light=1.3)
cm = sns.clustermap(ds, cmap=cmap, linewidths=.5)
cm.fig.set_figwidth(15)
cm.fig.set_figheight(15)
# NOTE(review): DataFrame.sort() was removed in pandas 0.20 — use sort_values('uid');
# also the loop body below lost its indentation in this notebook export
for idx,row in top_writer[:30].sort('uid').iterrows():
print( "https://brunch.co.kr/@{uid} {name}".format(uid=row.uid, name=row['작가명']))
https://brunch.co.kr/@borashow 보라쇼 https://brunch.co.kr/@breakthrough 한국현 https://brunch.co.kr/@brunch4nrs 윤청하 https://brunch.co.kr/@brunchflgu 조우성 변호사 https://brunch.co.kr/@brunchlftm sbroh https://brunch.co.kr/@cojette cojette https://brunch.co.kr/@haneulalice Alice in wonderland https://brunch.co.kr/@hmin0606 신유민 https://brunch.co.kr/@insuk 조인석 chris https://brunch.co.kr/@jihoonjeong 정지훈 https://brunch.co.kr/@jimmyrim 임지훈 Jimmy Rim https://brunch.co.kr/@jsksoft 상상 https://brunch.co.kr/@lifidea Jin Young Kim https://brunch.co.kr/@manya 마냐 https://brunch.co.kr/@meanimize Meanimize https://brunch.co.kr/@mobiinside Mobiinside https://brunch.co.kr/@pelexus 최윤섭 https://brunch.co.kr/@promise4u 양준철 https://brunch.co.kr/@rothem Rothem https://brunch.co.kr/@sapu0000 송준협 https://brunch.co.kr/@sclplus 엄지용 https://brunch.co.kr/@sooscape 흔디 https://brunch.co.kr/@suhanjang 티거 Jang https://brunch.co.kr/@suyoung 강수영 https://brunch.co.kr/@techsuda 도안구의 테크수다 https://brunch.co.kr/@wjchee 지원준 https://brunch.co.kr/@yoojs8512 유재석 https://brunch.co.kr/@yoonash yoonash https://brunch.co.kr/@yper 문현구 YPER 대표 https://brunch.co.kr/@yunjungseo Yunjung Seo
UX, 디자인 관련 글이 주를 이루는 UX,디자인,기획을 업으로 하시는 작가분
개발자 문화에 관련된 글이 주를 이루는 개발을 업으로하시는 작가분(대체적으로 글수가 10개 이하로 적음)
스타트업 문화, 회사의 생활, 직무, 업무 멘토링 등과 관련된 글이 많은 작가분들임
IT 트랜드, 컬럼, 사회 현상에 대한 내용을 담은 작가분들임
4번 클러스터와 유사하게 IT/모바일 트렌드, 데이터과학 등 저널의 성향이 있는 작가분들임