from __future__ import division, print_function

# Standard library, with Python 2/3 compatibility shims
try:
    import ConfigParser as configparser     # Python 2
except ImportError:
    import configparser                     # Python 3
try:
    from urllib import quote                # Python 2
except ImportError:
    from urllib.parse import quote          # Python 3 moved quote here
import copy
import json
import os
import re
import time

# Third-party
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.spatial
import tweepy
# Read configuration file for the request user-agent and the Twitter tokens.
config = configparser.ConfigParser()
config.read(os.path.expanduser('~/etc/python.cfg'))

# Identify ourselves politely to the servers we query.
user_agent = config.get('requests', 'user_agent')
mail_from = config.get('requests', 'from')
headers = {'User-Agent': user_agent, 'From': mail_from}
# Fetch names of Brede Wiki pages with researchers on Google Scholar and Twitter.
url_gst = ('http://neuro.compute.dtu.dk/w/api.php?'
           'action=query&format=json&list=categorymembers&'
           'cmtitle=Category:Researchers in Google Scholar and Twitter')
response = requests.get(url_gst, headers=headers).json()
pagetitles = [page['title'] for page in response['query']['categorymembers']]
# MediaWiki paginates category members; follow the 'query-continue' token
# until it disappears from the response.
while 'query-continue' in response:
    cmcontinue = response['query-continue']['categorymembers']['cmcontinue']
    # Percent-encode the token (it may contain URL-unsafe characters);
    # the previous bytes + str concatenation also broke on Python 3.
    url_continue = url_gst + '&cmcontinue=' + quote(cmcontinue.encode('utf-8'))
    # Bug fix: the continuation request previously omitted `headers`.
    response = requests.get(url_continue, headers=headers).json()
    pagetitles.extend([page['title']
                       for page in response['query']['categorymembers']])
print(pagetitles)
# Get researcher data from the Brede Wiki (raw wikitext of each page).
url_pages = "http://neuro.compute.dtu.dk/w/index.php?action=raw&title="
# Match the {{Researcher ...}} wiki template; raw strings avoid the
# invalid-escape-sequence warnings Python 3.6+ emits for '\s' in plain strings.
pattern_researcher = re.compile(r'{{Researcher(\s*?\|.*?)}}',
                                re.DOTALL | re.IGNORECASE | re.UNICODE)
# Extract "| field = value" pairs from the template body.
pattern_fields = re.compile(r'\s*\|\s*(\w+)\s*=\s*([^\|]*\w)',
                            re.DOTALL | re.UNICODE)
researchers = []
for pagetitle in pagetitles:
    # Page titles may contain non-ASCII characters; percent-encode for the URL.
    response = requests.get(url_pages + quote(pagetitle.encode('utf-8')),
                            headers=headers)
    print(pagetitle)
    researcher = pattern_researcher.findall(response.text)
    if researcher:
        # Keep only the first template occurrence on the page.
        researchers.append(dict(pattern_fields.findall(researcher[0])))
    else:
        print("Problem with " + pagetitle)
# researchers[14]  # notebook-cell inspection; a no-op (or IndexError) in a script
url_gs = 'http://scholar.google.com/citations?user='
# (Re)build the polite request headers -- same values as earlier in the file.
headers = {
    'From': config.get('requests', 'from'),
    'User-Agent': config.get('requests', 'user_agent'),
}
# Regex for the citation-count cells on a Google Scholar profile page.
# NOTE(review): a stray "2537 |" line of notebook output residue was removed
# here -- it was a syntax error in a plain Python file. The raw string avoids
# the Python 3.6+ invalid-escape warning for '\d'.
pattern_gscount = re.compile(r'(\d+) | ')
def get_google_scholar_counts(google_scholar_id):
response = requests.get(url_gs + google_scholar_id,
headers=headers)
counts = dict(zip(['citations', 'citations5', 'h-index', 'h-index5',
'i10-index', 'i10-index5'],
map(int, pattern_gscount.findall(response.text))))
return counts
# Yong-Yeol Ahn check
get_google_scholar_counts('US7OSNgAAAAJ')

# Get data from Google Scholar for researchers that do not yet have counts.
for researcher in researchers:
    if 'citations' not in researcher:
        print(researcher['name'])
        researcher.update(
            get_google_scholar_counts(researcher['googlescholar']))
        time.sleep(5)  # throttle requests to Google Scholar

# Saving just in case; the context manager closes the file handle
# (the previous bare open() leaked it).
with open('researchers.json', 'w') as output_file:
    json.dump(researchers, output_file)
# Twitter authentication
auth = tweepy.OAuthHandler(config.get('twitter', 'consumer_key'),
config.get('twitter', 'consumer_secret'))
auth.set_access_token(config.get('twitter', 'access_token'),
config.get('twitter', 'access_secret'))
# Function to download data from Twitter profiles
api = tweepy.API(auth)
def get_twitter_count(twitter_id):
    """Return follower/friend/status counts for one Twitter account.

    Best effort: on any error (e.g. suspended or renamed account) a
    message is printed and an empty dict is returned instead.
    """
    labels = ('Followers count', 'Friends count', 'Statuses count')
    fields = ('followers_count', 'friends_count', 'statuses_count')
    try:
        user = api.get_user(twitter_id)
        return dict(zip(labels, [getattr(user, field) for field in fields]))
    except Exception:
        print('Problem with ' + twitter_id)
        return {}
# Testing with Finn Aarup Nielsen (fnielsen)
get_twitter_count('fnielsen')

# Download data from Twitter
for researcher in researchers:
    researcher.update(get_twitter_count(researcher['twitter']))
    print(researcher['name'])

# Save just in case; context managers close the file handles
# (the previous bare open() calls leaked them).
with open('researchers.json', 'w') as output_file:
    json.dump(researchers, output_file)
with open('researchers.json') as input_file:
    researchers = json.load(input_file)
researchers[0]  # notebook-cell inspection of the first record
# Pandas!
df = pd.DataFrame(researchers)

# '%matplotlib inline' is an IPython magic and a SyntaxError in a plain
# Python file; re-enable it only when running inside a notebook.
# %matplotlib inline

# isnan: Houston, we've had a problem
# Keep only rows with a real, non-zero citation count.
indices = (~np.isnan(df['citations'])) & (df['citations'] != 0)
# Map positions within the filtered subset back to DataFrame index labels.
reverse_index = indices[indices].index.values
# Plot the data
matplotlib.rc('font', family='DejaVu Sans')
fig = df.plot(x='citations', y='Followers count',
              kind='scatter', figsize=(15, 10),
              marker='*', s=df['Statuses count'] / 10,
              linewidth=2, color=(0.8, 0.8, 0.8))
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
plt.xlabel('Google Scholar citations')
plt.ylabel('Twitter followers count')
plt.title('Kardashian index for Brede Wiki researchers on Google Scholar and Twitter')

# Power-law fit: a straight line in log-log space.
# .ix was removed in pandas 1.0; .loc is the label-based replacement.
p = np.polyfit(np.log(df.loc[indices, 'citations']),
               np.log(df.loc[indices, 'Followers count']), 1)
powerlaw = np.frompyfunc(lambda x: np.exp(p[1]) * x ** p[0], 1, 1)
plt.plot([1, 200000], powerlaw([1, 200000]), linewidth=5, color=(0.5, 1, 0.5))
plt.text(10, 5000, '{:.3} x citations^{:0.2}'.format(np.exp(p[1]), p[0]),
         fontsize=20)
# Annotation of some of the points with researcher names:
# label the points on the convex hull of the scatter.
# .ix was removed in pandas 1.0; .loc is the label-based replacement.
hull = scipy.spatial.ConvexHull(
    df.loc[indices, ['citations', 'Followers count']])
for index in hull.vertices:
    x, y, name = df.loc[reverse_index[index],
                        ['citations', 'Followers count', 'name']].values
    try:
        plt.text(x, y, name,
                 horizontalalignment='center', verticalalignment='center')
    except Exception:
        # Best effort: skip labels matplotlib cannot render
        # (bare `except:` narrowed so KeyboardInterrupt etc. still propagate).
        pass

# Myself and Ryoto and et al.
family_names = ['Nielsen', 'Tomioka', 'Willighagen']
for family_name in family_names:
    x, y, name = df.loc[df['family_name'] == family_name,
                        ['citations', 'Followers count', 'name']].values.flatten()
    plt.text(x, y, name,
             horizontalalignment='center', verticalalignment='center')
dummy = plt.axis((1, 200000, 1, 20000))
plt.show()
df.describe()
# K-index: ratio of actual followers to the follower count the
# power-law fit predicts from the citation count.
df['K-index'] = df['Followers count'] / powerlaw(df['citations'])

# Identify the 'scientific Kardashians'.
# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
high_score = df[indices].sort_values(by='K-index', ascending=False)[
    ['name', 'K-index', 'Statuses count']]
high_score
# The default notebook repr truncates long tables; render the full
# list as HTML instead. See:
# https://stackoverflow.com/questions/23388810/ipython-notebook-output-cell-is-truncating-contents-of-my-list
from IPython.display import HTML
full_table = high_score.to_html()
HTML(full_table)