%pylab inline
import time
from collections import defaultdict
import json
import pandas as pd
import os
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout
import numpy as np
import requests
import json
from datetime import datetime
from textblob import TextBlob
#import seaborn as sns
Populating the interactive namespace from numpy and matplotlib
# Load every stored revision of the seed articles and reduce it to one
# outlink count per article per calendar day.
df_s = pd.read_csv("seed_revisions.csv", sep="\t")
df_s.info()
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the raw table (SettingWithCopyWarning).
df_s = df_s.loc[~df_s.text.isnull(), ["article", "text", "timestamp"]].copy()
df_s["timestamp"] = pd.to_datetime(df_s.timestamp)
print(len(df_s))

# A link target containing any of these substrings is not counted.
# The set is the same one the baseline revisions are filtered with
# ("Media:", "pt:" and the misspelt "Catyegory:" included) so that seed and
# baseline outlink counts are measured identically.
seed_excluded_markers = (
    "http:", "https:", "Category:", "category:", "image:", "Image:",
    "file:", "File:", "fi:", "fr:", "de:", "ru:", "zh:", "vi:",
    "categoría:", "it:", "su:", "Special:", "Media:", "#Name|", "pt:",
    "Catyegory:",
)


def _seed_outlinks(text):
    """Return the ``[[...]]`` link targets of *text* that count as outlinks."""
    # Everything between "[[" and the next "]]" is one raw link target.
    targets = [chunk.split("]]")[0] for chunk in text.split("[[")[1:]]
    # A colon is tolerated only when it is followed by a space
    # (presumably punctuation inside a title rather than a namespace prefix).
    targets = [t for t in targets if ":" not in t or ": " in t]
    return [t for t in targets
            if not any(marker in t for marker in seed_excluded_markers)]


df_s["outlinks"] = df_s.text.apply(_seed_outlinks)
# len() always yields an integer, so no null filtering is needed afterwards.
df_s["outlinks_len"] = df_s.outlinks.apply(len)
print(df_s.info())

df_s = df_s[["article", "outlinks_len", "timestamp"]].copy()
df_s["timestamp"] = df_s.timestamp.apply(lambda x: x.date())
# Keep a single row per article and day: the first one in file order.
df_s = df_s.drop_duplicates(["timestamp", "article"], keep="first")
print(len(df_s))
<class 'pandas.core.frame.DataFrame'> RangeIndex: 63004 entries, 0 to 63003 Data columns (total 14 columns): Unnamed: 0 63004 non-null object _content_model 63004 non-null object _parent_id 63004 non-null int64 _sha1 62984 non-null object anon 63004 non-null object article 63004 non-null object comment 47173 non-null object minor 63003 non-null object revid 63002 non-null float64 rollbacktoken 0 non-null float64 text 62952 non-null object timestamp 63002 non-null object user 63002 non-null object text_len 62952 non-null float64 dtypes: float64(3), int64(1), object(10) memory usage: 6.7+ MB 62952 <class 'pandas.core.frame.DataFrame'> Int64Index: 62952 entries, 0 to 63003 Data columns (total 5 columns): article 62952 non-null object text 62952 non-null object timestamp 62952 non-null datetime64[ns] outlinks 62952 non-null object outlinks_len 62952 non-null int64 dtypes: datetime64[ns](1), int64(1), object(3) memory usage: 2.9+ MB None 31767
#for i in df_s.outlinks2.values:
# for j in i:
# #if (not "Wikipedia:" in j) and (not "Image:" in j) and (not "File:" in j) and (not "de:" in j) :"
# if len(j)>150:
# print j
# Resample every seed article to one outlink count per week, counted from the
# article's first stored revision up to today.
today = datetime.today()
# The per-article frames are collected and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated rows every time.
# The empty frame fixes the column layout even if there are no articles.
weekly_frames = [pd.DataFrame(columns=["time_w", "outlinks_len", "article", "week"])]
g = df_s.groupby("article")
for article_name, series in g:
    print(article_name)
    created = series.timestamp.min()
    print(created)
    # Newest revision first, so that .iloc[0] below really is the latest
    # revision on or before the sampled day, whatever the input row order.
    history = series.sort_values("timestamp", ascending=False)
    index = pd.date_range(start=created, end=today, freq="W")
    df1 = pd.DataFrame(index, columns=["time_w"])
    df1["time_w"] = df1.time_w.apply(lambda x: x.date())
    df1["outlinks_len"] = df1.time_w.apply(
        lambda x: history[history.timestamp <= x].iloc[0].outlinks_len)
    df1["article"] = article_name
    # Week number = position of the sample, 0 being the first sampled week.
    df1["week"] = df1.index
    weekly_frames.append(df1)
df_s_w = pd.concat(weekly_frames)
Ada_Yonath 2005-06-15 Adam_Riess 2006-07-13 Adrian_Bird 2009-02-05 Aharon_Razin 2009-10-07 Akira_Fujishima 2008-02-06 Akira_Suzuki_(chemist) 2010-03-21 Alain_Aspect 2004-12-21 Alan_Krueger 2006-08-23 Alberto_Alesina 2007-06-24 Allen_J._Bard 2008-07-07 Alvin_E._Roth 2008-06-27 Andre_Geim 2008-04-22 Andrew_Viterbi 2003-09-24 Angus_Deaton 2005-06-17 Anne_Osborn_Krueger 2004-03-05 Anthony_Pawson 2005-03-11 Anthony_R._Hunter 2007-03-31 Anton_Zeilinger 2004-08-21 Arieh_Warshel 2007-08-19 Armen_Alchian 2005-02-08 Arthur_B._McDonald 2007-11-02 Artur_Avila 2009-07-12 Aziz_Sancar 2006-06-06 B._Jayant_Baliga 2010-08-22 Barbara_Liskov 2005-02-15 Bernd_Giese 2009-10-10 Brian_Druker 2009-11-13 Brian_Kobilka 2008-10-16 Brian_Schmidt 2006-08-02 Bruce_Ames 2004-07-22 Bruce_Beutler 2007-11-27 C%C3%A9dric_Villani 2010-05-01 Carol_W._Greider 2006-11-27 Carolyn_R._Bertozzi 2006-11-27 Chad_Mirkin 2007-11-30 Charles_David_Allis 2006-06-02 Charles_F._Manski 2008-05-10 Charles_H._Bennett_(computer_scientist) 2005-03-19 Charles_K._Kao 2002-05-01 Charles_L._Bennett 2006-10-24 Charles_L._Kane 2013-02-06 Charles_Lee_(scientist) 2014-04-07 Charles_M._Lieber 2006-07-09 Charles_P._Thacker 2005-04-21 Charles_Sawyers 2009-11-11 Charles_T._Kresge 2013-02-19 Ching_W._Tang 2010-04-10 Christopher_A._Pissarides 2005-10-20 Christopher_A._Sims 2006-07-19 Dale_T._Mortensen 2005-03-13 Dan_Shechtman 2006-01-10 David_Card 2005-07-26 David_Forbes_Hendry 2007-01-09 David_J._Wineland 2009-10-09 David_Julius 2011-06-01 David_R._Smith 2009-10-10 David_Spergel 2006-07-24 Deborah_S._Jin 2004-01-30 Dennis_Slamon 2008-10-20 Didier_Queloz 2004-10-02 Douglas_Diamond 2011-09-25 Douglas_L._Coleman 2009-11-19 Edvard_Moser 2009-05-27 Ei-ichi_Negishi 2008-02-20 Eli_Yablonovitch 2004-03-06 Elinor_Ostrom 2006-05-06 Elizabeth_Blackburn 2004-09-16 Elon_Lindenstrauss 2010-08-15 Emmanuelle_Charpentier 2015-06-23 Endre_Szemer%C3%A9di 2004-05-18 Eric_Betzig 2014-10-08 Eric_H._Davidson 2006-11-18 Erkki_Ruoslahti 2008-04-01 
Ernest_McCulloch 2005-07-24 Ernst_Fehr 2006-03-09 Eugene_Fama 2003-08-21 Ferenc_Krausz 2006-03-13 Fran%C3%A7ois_Englert 2006-12-10 Fran%C3%A7oise_Barr%C3%A9-Sinoussi 2008-10-06 G._David_Tilman 2006-11-20 Galen_D._Stucky 2007-08-01 Gary_Ruvkun 2008-05-25 Gary_Schuster 2008-10-07 Geoffrey_Marcy 2004-09-14 George_E._Smith 2006-01-05 Gilles_Brassard 2005-03-20 Gordon_Moore 2003-03-15 Gordon_Tullock 2005-02-11 Graeme_Moad 2014-09-25 Graham_Hutchings 2011-12-27 Halbert_White 2010-02-15 Harald_zur_Hausen 2008-05-25 Harold_Demsetz 2006-02-09 Hideo_Hosono 2013-02-11 Hideo_Ohno 2015-02-04 Hiroshi_Amano 2014-10-07 Howard_Cedar 2009-10-07 Irwin_M._Jacobs 2005-09-12 Isamu_Akasaki 2007-01-06 Israel_Kirzner 2003-05-03 Jack_W._Szostak 2007-11-02 Jacqueline_Barton 2004-11-14 Jacques_Miller 2005-07-19 Jacques_Tits 2003-05-01 James_E._Darnell 2006-09-17 James_Rothman 2005-03-04 James_Till 2005-07-04 Jean_Fr%C3%A9chet 2005-10-21 Jean_Tirole 2006-03-20 Jeffrey_I._Gordon 2008-02-19 Jeffrey_M._Friedman 2006-10-01 Jennifer_Doudna 2012-08-26 Jerry_A._Hausman 2004-08-12 John_A._List 2006-01-12 John_B._Goodenough 2005-03-14 John_Clauser 2010-06-01 John_Forbes_Nash_Jr. 
2002-10-11 John_G._Thompson 2004-07-28 John_Gurdon 2004-11-05 John_Hardman_Moore 2017-06-17 John_L._Hennessy 2004-01-21 John_Milnor 2002-07-12 John_O%27Keefe_(neuroscientist) 2013-11-23 John_Pendry 2006-02-07 John_Tate 2003-06-14 Jordi_Gal%C3%AD 2007-12-14 Joseph_Altman 2006-07-23 Joseph_Felsenstein 2006-01-23 Joshua_Angrist 2007-11-26 Juan_Ignacio_Cirac_Sasturain 2006-10-18 Judea_Pearl 2004-06-03 Jules_A._Hoffmann 2007-11-27 Karl_Barry_Sharpless 2003-12-21 Kazutoshi_Mori 2014-02-05 Kevin_M._Murphy 2004-12-29 Konstantin_Novoselov 2010-09-16 Krzysztof_Matyjaszewski 2007-12-15 Lars_Peter_Hansen 2005-12-03 Laurens_W._Molenkamp 2012-09-20 Leigh_Canham 2006-04-02 Lene_Hau 2004-12-20 Leslie_Lamport 2003-03-13 Leslie_Valiant 2005-10-26 Lloyd_Shapley 2004-11-05 Louis_E._Brus 2008-05-28 Louis_Nirenberg 2005-04-29 Luc_Montagnier 2004-07-17 Lyman_Page 2007-04-18 M._Hashem_Pesaran 2006-05-02 M_Stanley_Whittingham 2006-10-30 Makoto_Kobayashi_(physicist) 2007-04-25 Manjul_Bhargava 2006-05-11 Mark_Gertler_(economist) 2007-10-14 Mark_Granovetter 2004-06-03 Martin_Chalfie 2008-10-08 Martin_Feldstein 2004-09-21 Martin_Hairer 2010-04-11 Martin_Hellman 2004-01-06 Martin_Karplus 2006-03-20 Martin_Weitzman 2007-12-18 Maryam_Mirzakhani 2006-08-12 Masatoshi_Takeichi 2012-09-25 Matthew_Rabin 2005-02-21 May-Britt_Moser 2009-06-01 Michael_Berry_(physicist) 2003-09-19 Michael_Gr%C3%A4tzel 2008-06-23 Michael_Grunstein 2006-08-31 Michael_Levitt 2006-03-17 Michael_Stonebraker 2002-07-22 Michael_Wigler 2007-02-25 Michel_Mayor 2004-04-03 Mikhail_Leonidovich_Gromov 2004-07-02 Mildred_Dresselhaus 2004-12-01 Morris_Chang 2005-08-13 Nadrian_Seeman 2010-06-04 Nancy_A._Moran 2010-05-07 Ng%C3%B4_B%E1%BA%A3o_Ch%C3%A2u 2007-05-09 Nicholas_Lydon 2009-11-13 Nobuhiro_Kiyotaki 2007-11-30 Oliver_E._Williamson 2003-04-06 Omar_M._Yaghi 2008-05-09 Osamu_Shimomura 2008-10-08 Patrick_O._Brown 2006-03-25 Paul_Alivisatos 2005-06-21 Paul_Corkum 2009-06-30 Paul_Krugman 2003-09-07 Paul_L._Modrich 2007-04-05 Peidong_Yang 
2011-07-16 Peter_C._B._Phillips 2006-03-10 Peter_Crane 2005-10-04 Peter_Diamond 2006-10-02 Peter_Higgs 2003-10-12 Peter_Howitt_(economist) 2009-10-15 Peter_Walter 2006-04-28 Peter_Zoller 2006-09-11 Philippe_Aghion 2008-12-17 Pierre_Deligne 2003-12-09 Ralph_M._Steinman 2007-09-17 Ramamoorthy_Ramesh 2014-09-25 Randy_Schekman 2005-03-04 Richard_Blundell 2006-10-15 Richard_F._Heck 2008-06-15 Richard_Hynes 2011-12-27 Richard_Peto 2005-08-14 Richard_Posner 2003-06-14 Robert_Edwards_(physiologist) 2005-03-31 Robert_G._Roeder 2005-03-04 Robert_H._Dennard 2004-03-22 Robert_J._Shiller 2004-11-08 Robert_Lefkowitz 2007-06-14 Robert_S._Langer 2005-06-28 Robert_Tjian 2008-10-05 Roger_Penrose 2001-11-05 Roger_Y._Tsien 2007-09-15 Rory_Collins 2008-04-23 Ryoo_Ryong 2011-12-07 Sajeev_John 2004-01-06 Sam_Peltzman 2006-07-12 Satoshi_%C5%8Cmura 2015-10-05 Saul_Perlmutter 2005-01-28 Seiji_Ogawa 2006-07-18 Serge_Haroche 2007-12-11 Shafi_Goldwasser 2004-07-19 Shimon_Sakaguchi 2015-10-02 Shinya_Yamanaka 2007-11-23 Shizuo_Akira 2008-05-28 Shoucheng_Zhang 2011-12-16 Shuji_Nakamura 2004-09-30 Silvio_Micali 2005-01-11 Stanislav_Smirnov 2009-05-10 Stefan_Hell 2006-11-24 Stephen_E._Harris 2013-01-11 Stephen_J._Lippard 2010-04-16 Stephen_Ross_(economist) 2004-11-25 Stephen_W._Scherer 2010-10-15 Steven_Van_Slyke 2006-11-28 Takaaki_Kajita 2015-10-06 Thomas_A._Steitz 2009-10-07 Thomas_C._S%C3%BCdhof 2010-04-08 Thomas_Ebbesen 2008-11-15 Thomas_J._Sargent 2005-03-15 Tomas_Lindahl 2008-11-10 Tony_Atkinson 2006-03-13 Toshihide_Maskawa 2007-04-25 Tu_Youyou 2011-09-12 Venkatraman_Ramakrishnan 2005-12-14 Vera_Rubin 2004-02-17 Victor_Ambros 2008-05-25 Whitfield_Diffie 2004-01-11 Willard_Boyle 2006-01-05 William_Baumol 2004-12-07 William_C._Campbell_(scientist) 2015-10-05 William_E._Moerner 2006-07-04 William_Nordhaus 2006-02-10 William_Wootters 2006-01-15 Winslow_Briggs 2015-09-27 Yakir_Aharonov 2006-04-05 Yakov_Sinai 2005-04-17 Yoichiro_Nambu 2004-10-13 Yoshinori_Ohsumi 2012-11-18 Yoshinori_Tokura 
2014-09-23 Zhong_Lin_Wang 2015-10-06
# Group the weekly samples by week number and summarise, per week, the
# outlink counts of all seed articles that have a sample in that week.
df_s_w.info()
g = df_s_w.groupby("week")
# A plain dict keyed by week number; nothing here needs default values.
s_w_st_dict = {
    week_name: {
        "mean_w": series.outlinks_len.mean(),
        "std_1": series.outlinks_len.sem(),     # standard error of the mean
        "count": series.outlinks_len.count(),   # samples in this week
    }
    for week_name, series in g
}
# One row per week, one column per statistic.
df_s_w_st = pd.DataFrame.from_dict(s_w_st_dict, orient="index")
# Weeks backed by 10 or fewer samples are dropped.
df_s_w_st = df_s_w_st[df_s_w_st["count"] > 10]
df_s_w_st.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 147317 entries, 0 to 130 Data columns (total 4 columns): time_w 147317 non-null object outlinks_len 147317 non-null object article 147317 non-null object week 147317 non-null object dtypes: object(4) memory usage: 5.6+ MB <class 'pandas.core.frame.DataFrame'> Int64Index: 774 entries, 0 to 773 Data columns (total 3 columns): count 774 non-null int64 mean_w 774 non-null float64 std_1 774 non-null float64 dtypes: float64(2), int64(1) memory usage: 24.2 KB
# Baseline revisions: load the table, keep the rows that have text and
# extract the list of counted link targets for every revision.
df_b2 = pd.read_csv("baseline_revisions.csv", sep="\t")
df_b2.info()
df_b2 = df_b2[["article", "text", "timestamp"]]
df_b2 = df_b2[df_b2.text.notnull()]
df_b2.timestamp = pd.to_datetime(df_b2.timestamp)
print(len(df_b2))

# Substrings that disqualify a link target.
b2_excluded_markers = (
    "http:", "https:", "Category:", "category:", "image:", "Image:",
    "file:", "File:", "fi:", "fr:", "de:", "ru:", "zh:", "vi:",
    "categoría:", "it:", "su:", "Special:", "#Name|",
)


def _b2_outlinks(text):
    """Return the ``[[...]]`` link targets of *text* that pass both filters."""
    kept = []
    for piece in text.split("[[")[1:]:
        target = piece.split("]]")[0]
        # A colon is only acceptable when it is followed by a space.
        if ":" in target and ": " not in target:
            continue
        if any(marker in target for marker in b2_excluded_markers):
            continue
        kept.append(target)
    return kept


df_b2["outlinks"] = df_b2.text.apply(_b2_outlinks)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 24163 entries, 0 to 24162 Data columns (total 14 columns): Unnamed: 0 24163 non-null int64 _content_model 24163 non-null object _parent_id 24163 non-null int64 _sha1 24132 non-null object anon 24163 non-null bool article 24163 non-null object comment 17697 non-null object minor 24163 non-null bool revid 24163 non-null int64 rollbacktoken 0 non-null float64 text 24126 non-null object timestamp 24163 non-null object user 24163 non-null object text_len 24126 non-null float64 dtypes: bool(2), float64(2), int64(3), object(7) memory usage: 2.3+ MB 24126
# Load every stored revision of the baseline articles and reduce it to one
# outlink count per article per calendar day.
df_b = pd.read_csv("baseline_revisions.csv", sep="\t")
df_b.info()
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the raw table (SettingWithCopyWarning).
df_b = df_b.loc[~df_b.text.isnull(), ["article", "text", "timestamp"]].copy()
df_b["timestamp"] = pd.to_datetime(df_b.timestamp)
print(len(df_b))

# A link target containing any of these substrings is not counted.
baseline_excluded_markers = (
    "http:", "https:", "Category:", "category:", "image:", "Image:",
    "file:", "File:", "fi:", "fr:", "de:", "ru:", "zh:", "vi:",
    "categoría:", "it:", "su:", "Special:", "Media:", "#Name|", "pt:",
    "Catyegory:",
)


def _baseline_outlinks(text):
    """Return the ``[[...]]`` link targets of *text* that count as outlinks."""
    # Everything between "[[" and the next "]]" is one raw link target.
    targets = [chunk.split("]]")[0] for chunk in text.split("[[")[1:]]
    # A colon is tolerated only when it is followed by a space
    # (presumably punctuation inside a title rather than a namespace prefix).
    targets = [t for t in targets if ":" not in t or ": " in t]
    return [t for t in targets
            if not any(marker in t for marker in baseline_excluded_markers)]


df_b["outlinks"] = df_b.text.apply(_baseline_outlinks)
# len() always yields an integer, so no null filtering is needed afterwards.
df_b["outlinks_len"] = df_b.outlinks.apply(len)
print(df_b.info())

df_b = df_b[["article", "outlinks_len", "timestamp"]].copy()
df_b["timestamp"] = df_b.timestamp.apply(lambda x: x.date())
# Keep a single row per article and day: the first one in file order.
df_b = df_b.drop_duplicates(["timestamp", "article"], keep="first")
print(len(df_b))
<class 'pandas.core.frame.DataFrame'> RangeIndex: 24163 entries, 0 to 24162 Data columns (total 14 columns): Unnamed: 0 24163 non-null int64 _content_model 24163 non-null object _parent_id 24163 non-null int64 _sha1 24132 non-null object anon 24163 non-null bool article 24163 non-null object comment 17697 non-null object minor 24163 non-null bool revid 24163 non-null int64 rollbacktoken 0 non-null float64 text 24126 non-null object timestamp 24163 non-null object user 24163 non-null object text_len 24126 non-null float64 dtypes: bool(2), float64(2), int64(3), object(7) memory usage: 2.3+ MB 24126 <class 'pandas.core.frame.DataFrame'> Int64Index: 24126 entries, 0 to 24162 Data columns (total 5 columns): article 24126 non-null object text 24126 non-null object timestamp 24126 non-null datetime64[ns] outlinks 24126 non-null object outlinks_len 24126 non-null int64 dtypes: datetime64[ns](1), int64(1), object(3) memory usage: 1.1+ MB None 13642
# Resample every baseline article to one outlink count per week, counted from
# the article's first stored revision up to today.
today = datetime.today()
# The per-article frames are collected and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated rows every time.
# The empty frame fixes the column layout even if there are no articles.
weekly_frames = [pd.DataFrame(columns=["time_w", "outlinks_len", "article", "week"])]
g = df_b.groupby("article")
for article_name, series in g:
    print(article_name)
    created = series.timestamp.min()
    # Newest revision first, so that .iloc[0] below really is the latest
    # revision on or before the sampled day, whatever the input row order.
    history = series.sort_values("timestamp", ascending=False)
    index = pd.date_range(start=created, end=today, freq="W")
    df1 = pd.DataFrame(index, columns=["time_w"])
    df1["time_w"] = df1.time_w.apply(lambda x: x.date())
    df1["outlinks_len"] = df1.time_w.apply(
        lambda x: history[history.timestamp <= x].iloc[0].outlinks_len)
    df1["article"] = article_name
    # Week number = position of the sample, 0 being the first sampled week.
    df1["week"] = df1.index
    weekly_frames.append(df1)
df_b_w = pd.concat(weekly_frames)
%C4%90%C3%A0m_Thanh_S%C6%A1n %C5%BDeljko_Reiner Achim_M%C3%BCller Akiko_Iwasaki Alan_Aderem Alan_Ashworth Albert-L%C3%A1szl%C3%B3_Barab%C3%A1si Albert_Meyers Alex_Bateman Alex_Zettl Alexander_Zamolodchikov Ali_Khademhosseini Allen_Steere Amir_Yacoby Andrei_Shleifer Andrew_Bernard Andrew_H._Van_de_Ven Annarosa_Leri Anne_O%27Garra Antonio_Lanzavecchia Ariel_Rubinstein Armin_Falk Arun_Majumdar Axel_D._Becke Axel_Ullrich Babak_Hassibi Barry_Everitt Bengt_R._Holmstr%C3%B6m Bernhard_Keimer Bernhard_Keller Beth_Levine_(physician) Birger_Wernerfelt Biswanath_Mukherjee Bradley_Efron Brigitta_Stockinger Bruce_D._Walker C._N._R._Rao Campbell_Harvey Carlo_La_Vecchia Cees_Dekker Charles_M._Lieber Chen_Guanrong Chih-Jen_Lin Christoph_Gerber Cisca_Wijmenga Claire_Berger Cumrun_Vafa Dale_L._Boger Dan_Ariely Dan_Boneh Dariush_Mozaffarian Daron_Acemo%C4%9Flu David_Haussler David_Milstein Deborah_Estrin Dennis_P._Curran Didier_Astruc Dieter_Enders Dimitri_Nanopoulos Donal_Bradley Edward_Glaeser Edward_Miguel Edward_Ott Edward_R._Dougherty Elias_James_Corey Elza_Erkip Eric_Hanushek Esther_Duflo Eugene_Braunwald Ewan_Birney Federica_Sallusto Fiona_Powrie Fran%C3%A7ois_Diederich Frank_Glorius Frank_Neese Frans_Van_de_Werf G%C3%A1bor_A._Somorjai G%C3%A9rald_Bastard G%C3%A9rard_F%C3%A9rey George_Davey_Smith George_Loewenstein George_Sawatzky Giorgio_Parisi Giuseppe_Mingione Gon%C3%A7alo_Abecasis Graham_Colditz Graham_Fleming Guido_Imbens Hans_Clevers Helmut_Ringsdorf Helmut_Schwarz Heng_Li Henk_Volberda Henri_Berestycki Henry_Chesbrough Hongjie_Dai Horst_St%C3%B6cker Hyeon_Taeghwan Ian_Affleck Ian_F._Akyildiz Ian_Ford Idun_Reiten Igor_Jurisica Immanuel_Bloch Ingrid_Daubechies Ira_Pastan Irun_Cohen Ivan_K._Schuller Jack_Cuzick James_R._Heath Jean-Louis_Vincent Jean-Luc_Br%C3%A9das Jean-Marie_Tarascon Jean-Michel_Sav%C3%A9ant Jean-Pierre_Sauvage Jean_Fr%C3%A9chet Jeffrey_A._Harvey Jenny_Nelson Jens_Nielsen Jerome_Groopman Jianqing_Fan Jing_Li_(chemist) Joan_Massagu%C3%A9_Sol%C3%A9 
John_A._List John_Danesh John_Perdew John_Robertson_(physicist) John_Y._Campbell Jon_Clardy Jose_Baselga Joseph_Schlessinger Joseph_Wang Joshua_Jortner Juan_Mart%C3%ADn_Maldacena Julian_Birkinshaw Jun_Ye K%C3%A1ri_Stef%C3%A1nsson Keith_Fagnou Keith_Olive Keith_Usherwood_Ingold Kendall_Houk Kenneth_Kendler Kim_Kimoon Kun-Liang_Guan Kurt_Binder Lars_E._O._Svensson Leo_Paquette Liao_Shijun Luigi_Zingales Luis_Oro Lynn_Schneemeyer Malcolm_Chisholm Malcolm_Green_(chemist) Marc_A._Kastner Marc_Melitz Marianne_Bertrand Marlan_Scully Martin_Eichenbaum Masaru_Tomita Matja%C5%BE_Perc Matthias_Egger Matthias_Mann Menachem_Elimelech Michael_Boehnke Michael_F._Lappert Michael_I._Jordan Michael_L._Gross_(chemist) Michael_L._Klein Michael_L._Tushman Michael_Lounsbury Michal_Lipson Michele_Parrinello Mietek_Jaroniec Murray_Brennan Napoleone_Ferrara Nathan_Seiberg Nello_Cristianini Nicholas_Bloom Nicholas_C._Handy Oded_Schramm Olivier_Blanchard Pan_Jianwei Paul_McEuen Paul_Ridker Paul_Volberding Pauline_van_den_Driessche Peer_Bork Peter_Gavin_Hall Peter_Grassberger Peter_Kalmus Peter_Knight_(scientist) Peter_Ozsv%C3%A1th Peter_Reinhard_Hansen Peter_Sleight Phaedon_Avouris Philip_Kim Rainer_Blatt Ralph_Nuzzo Ralph_Weissleder Reinhart_Ahlrichs Richard_H._Holm Richard_Roll Richard_Wilson_(physicist) Robert_H._Crabtree Robert_J._Birgeneau Robert_Lusch Robert_Peter_Gale Robert_Tibshirani Robert_West_(chemist) Roberto_Car Rodney_J._Bartlett Rolf_Apweiler Roman_Jackiw Ronald_Fedkiw Ross_Levine Rudolf_Grimm Rudolf_Jaenisch Ruedi_Aebersold Ruslan_Medzhitov Saeid_Abbasbandy Salim_Yusuf Salvador_Moncada Samuel_Broder Sang-Wook_Cheong Sankar_Das_Sarma Sendhil_Mullainathan Sergey_Fomin Sergio_Ferrara Sharon_Ann_Hunt Shin%27ichi_Nojiri Shlomo_Shamai Shuji_Ogino Siamon_Gordon Simeon_Djankov Simon_Johnson_(economist) Simon_White Spyros_Makridakis Stefanie_Dimmeler Stephen_Dunnett Stephen_J._Lippard Stephen_L._Buchwald Stephen_MacMahon Stephen_R._Bloom Stephen_Vargo Steven_Kaplan_(economist) 
Stuart_A._Rice Stuart_Parkin Subir_Sachdev Subra_Suresh Tadamitsu_Kishimoto Tadatsugu_Taniguchi Takuzo_Aida Tasawar_Hayat Theodore_H._Geballe Thomas_J.R._Hughes Thomas_Starzl Tilman_Esslinger Tim_Bollerslev Tomas_H%C3%B6kfelt Trevor_Hastie Ulrike_Malmendier Uri_Gneezy Valarie_Zeithaml Valentin_Fuster Victor_Dzau Vincenzo_Balzani Wayne_Fuller Werner_Hacke William_Easterly William_H._Miller_(chemistry) Witold_Nazarewicz Xavier_Gabaix Yang_Huanming Younan_Xia Zolt%C3%A1n_Szab%C3%B3_(mathematician)
# Group the weekly samples by week number and summarise, per week, the
# outlink counts of all baseline articles that have a sample in that week.
df_b_w.info()
g = df_b_w.groupby("week")
# A plain dict keyed by week number; nothing here needs default values.
b_w_st_dict = {
    week_name: {
        "mean_w": series.outlinks_len.mean(),
        "std_1": series.outlinks_len.sem(),     # standard error of the mean
        "count": series.outlinks_len.count(),   # samples in this week
    }
    for week_name, series in g
}
# One row per week, one column per statistic.
df_b_w_st = pd.DataFrame.from_dict(b_w_st_dict, orient="index")
# Weeks backed by 10 or fewer samples are dropped.
df_b_w_st = df_b_w_st[df_b_w_st["count"] > 10]
df_b_w_st.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 121020 entries, 0 to 581 Data columns (total 4 columns): time_w 121020 non-null object outlinks_len 121020 non-null object article 121020 non-null object week 121020 non-null object dtypes: object(4) memory usage: 4.6+ MB <class 'pandas.core.frame.DataFrame'> Int64Index: 689 entries, 0 to 688 Data columns (total 3 columns): count 689 non-null int64 mean_w 689 non-null float64 std_1 689 non-null float64 dtypes: float64(2), int64(1) memory usage: 21.5 KB
# Two panels sharing the y axis: weekly mean outlink count with its standard
# error, baseline articles on the left and seed articles on the right.
fig, (ax0, ax1) = plt.subplots(ncols=2, sharey=True, figsize=[20, 10])
panels = (
    (ax0, df_b_w_st,
     "Length of articles of non-awarded scientists. 0 week is the week of article creation"),
    (ax1, df_s_w_st,
     'Length of articles of awarded scientists. 0 week is the week of article creation'),
)
for axis, stats, panel_title in panels:
    weeks = stats.index.values
    # Error bars only (no connecting line), then the mean as a thin blue line.
    axis.errorbar(weeks, stats.mean_w, stats.std_1, linestyle='None')
    axis.plot(weeks, stats.mean_w, "b", linewidth=1)
    axis.set_xlabel("week")
    axis.set_title(panel_title)
# The y axis is shared, so only the left panel carries the label.
ax0.set_ylabel("outlink counts")
plt.subplots_adjust(wspace=0.01, hspace=0.01)
plt.show()
# Load the per-scientist metadata (one JSON object per article key) and index
# it by article name, with the award date as a plain calendar date.
with open("seed_creation_date.json", "r") as f:
    # No `encoding` argument: UTF-8 is already the default on Python 2, and
    # the keyword is rejected by json.load on Python 3.9+.
    sci_aw_dict = json.load(f)
df_sci_aw = pd.DataFrame.from_dict(sci_aw_dict, orient="index")
# Keys look like paths/URLs; only the part after the last "/" is kept.
df_sci_aw.index = [key.split("/")[-1] for key in df_sci_aw.index]
df_sci_aw["Award_date"] = pd.to_datetime(df_sci_aw.Award_date).apply(lambda x: x.date())
# Resample every seed article to weekly outlink counts aligned on its award
# date: week 0 is the first sample on/after the award, negative weeks are the
# samples between article creation and the award.
today = datetime.today()
# The per-article frames are collected and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated rows every time.
# The empty frame fixes the column layout even if there are no articles.
aligned_frames = [pd.DataFrame(columns=["time_w", "outlinks_len", "article", "week"])]
g = df_s.groupby("article")
for article_name, series in g:
    print(article_name)
    created = series.timestamp.min()
    award_date = df_sci_aw.loc[article_name].Award_date
    # Newest revision first, so that .iloc[0] below really is the latest
    # revision on or before the sampled day, whatever the input row order.
    history = series.sort_values("timestamp", ascending=False)

    def outlinks_on(day):
        """Outlink count of the latest revision made on or before *day*."""
        return history[history.timestamp <= day].iloc[0].outlinks_len

    # Weekly samples from the award until today; default index 0, 1, 2, ...
    df1_1 = pd.DataFrame(pd.date_range(start=award_date, end=today, freq="W"),
                         columns=["time_w"])
    df1_1["time_w"] = df1_1.time_w.apply(lambda x: x.date())
    if created > award_date:
        # The article did not exist yet at award time: keep only the weeks
        # since its creation (they keep their post-award week numbers) and
        # skip the pre-award part entirely.
        df1_1 = df1_1[df1_1.time_w >= created].copy()
        df1_1["outlinks_len"] = df1_1.time_w.apply(outlinks_on)
        df1 = df1_1[["time_w", "outlinks_len"]].copy()
        print("==> created after award")
    else:
        # Weekly samples from creation up to the award, renumbered so that
        # the last pre-award sample is week -1.
        df1_2 = pd.DataFrame(pd.date_range(start=created, end=award_date, freq="W"),
                             columns=["time_w"])
        df1_2["time_w"] = df1_2.time_w.apply(lambda x: x.date())
        df1_2.index = df1_2.index - len(df1_2)
        df1_1["outlinks_len"] = df1_1.time_w.apply(outlinks_on)
        df1_2["outlinks_len"] = df1_2.time_w.apply(outlinks_on)
        df1 = pd.concat([df1_1, df1_2])
    df1["article"] = article_name
    df1["week"] = df1.index
    aligned_frames.append(df1)
df_s_w_aw = pd.concat(aligned_frames)
Ada_Yonath Adam_Riess Adrian_Bird Aharon_Razin Akira_Fujishima Akira_Suzuki_(chemist) Alain_Aspect Alan_Krueger Alberto_Alesina Allen_J._Bard Alvin_E._Roth Andre_Geim Andrew_Viterbi Angus_Deaton Anne_Osborn_Krueger Anthony_Pawson Anthony_R._Hunter Anton_Zeilinger Arieh_Warshel Armen_Alchian Arthur_B._McDonald Artur_Avila Aziz_Sancar B._Jayant_Baliga Barbara_Liskov Bernd_Giese ==> created after award Brian_Druker Brian_Kobilka Brian_Schmidt Bruce_Ames Bruce_Beutler C%C3%A9dric_Villani Carol_W._Greider Carolyn_R._Bertozzi Chad_Mirkin Charles_David_Allis Charles_F._Manski Charles_H._Bennett_(computer_scientist) Charles_K._Kao Charles_L._Bennett Charles_L._Kane Charles_Lee_(scientist) Charles_M._Lieber Charles_P._Thacker Charles_Sawyers Charles_T._Kresge Ching_W._Tang Christopher_A._Pissarides Christopher_A._Sims Dale_T._Mortensen Dan_Shechtman David_Card David_Forbes_Hendry David_J._Wineland David_Julius David_R._Smith ==> created after award David_Spergel Deborah_S._Jin Dennis_Slamon Didier_Queloz Douglas_Diamond Douglas_L._Coleman Edvard_Moser Ei-ichi_Negishi Eli_Yablonovitch Elinor_Ostrom Elizabeth_Blackburn Elon_Lindenstrauss Emmanuelle_Charpentier Endre_Szemer%C3%A9di Eric_Betzig Eric_H._Davidson Erkki_Ruoslahti Ernest_McCulloch Ernst_Fehr Eugene_Fama Ferenc_Krausz Fran%C3%A7ois_Englert Fran%C3%A7oise_Barr%C3%A9-Sinoussi G._David_Tilman Galen_D._Stucky Gary_Ruvkun Gary_Schuster Geoffrey_Marcy George_E._Smith Gilles_Brassard Gordon_Moore Gordon_Tullock Graeme_Moad Graham_Hutchings Halbert_White Harald_zur_Hausen Harold_Demsetz Hideo_Hosono Hideo_Ohno ==> created after award Hiroshi_Amano Howard_Cedar Irwin_M._Jacobs Isamu_Akasaki Israel_Kirzner Jack_W._Szostak Jacqueline_Barton Jacques_Miller Jacques_Tits James_E._Darnell James_Rothman James_Till Jean_Fr%C3%A9chet Jean_Tirole Jeffrey_I._Gordon Jeffrey_M._Friedman Jennifer_Doudna Jerry_A._Hausman John_A._List John_B._Goodenough John_Clauser John_Forbes_Nash_Jr. 
John_G._Thompson John_Gurdon John_Hardman_Moore ==> created after award John_L._Hennessy John_Milnor John_O%27Keefe_(neuroscientist) John_Pendry John_Tate Jordi_Gal%C3%AD Joseph_Altman Joseph_Felsenstein Joshua_Angrist Juan_Ignacio_Cirac_Sasturain Judea_Pearl Jules_A._Hoffmann Karl_Barry_Sharpless Kazutoshi_Mori Kevin_M._Murphy Konstantin_Novoselov ==> created after award Krzysztof_Matyjaszewski Lars_Peter_Hansen Laurens_W._Molenkamp Leigh_Canham Lene_Hau Leslie_Lamport Leslie_Valiant Lloyd_Shapley Louis_E._Brus Louis_Nirenberg Luc_Montagnier Lyman_Page M._Hashem_Pesaran M_Stanley_Whittingham Makoto_Kobayashi_(physicist) Manjul_Bhargava Mark_Gertler_(economist) Mark_Granovetter Martin_Chalfie Martin_Feldstein Martin_Hairer Martin_Hellman Martin_Karplus Martin_Weitzman Maryam_Mirzakhani Masatoshi_Takeichi Matthew_Rabin May-Britt_Moser Michael_Berry_(physicist) Michael_Gr%C3%A4tzel Michael_Grunstein Michael_Levitt Michael_Stonebraker Michael_Wigler Michel_Mayor Mikhail_Leonidovich_Gromov Mildred_Dresselhaus Morris_Chang Nadrian_Seeman Nancy_A._Moran Ng%C3%B4_B%E1%BA%A3o_Ch%C3%A2u Nicholas_Lydon Nobuhiro_Kiyotaki Oliver_E._Williamson Omar_M._Yaghi Osamu_Shimomura Patrick_O._Brown Paul_Alivisatos Paul_Corkum Paul_Krugman Paul_L._Modrich Peidong_Yang Peter_C._B._Phillips Peter_Crane Peter_Diamond Peter_Higgs Peter_Howitt_(economist) Peter_Walter Peter_Zoller Philippe_Aghion Pierre_Deligne Ralph_M._Steinman Ramamoorthy_Ramesh Randy_Schekman Richard_Blundell Richard_F._Heck Richard_Hynes Richard_Peto Richard_Posner Robert_Edwards_(physiologist) Robert_G._Roeder Robert_H._Dennard Robert_J._Shiller Robert_Lefkowitz Robert_S._Langer Robert_Tjian Roger_Penrose Roger_Y._Tsien Rory_Collins Ryoo_Ryong Sajeev_John Sam_Peltzman Satoshi_%C5%8Cmura Saul_Perlmutter Seiji_Ogawa Serge_Haroche Shafi_Goldwasser Shimon_Sakaguchi Shinya_Yamanaka Shizuo_Akira Shoucheng_Zhang Shuji_Nakamura Silvio_Micali Stanislav_Smirnov Stefan_Hell Stephen_E._Harris ==> created after award 
Stephen_J._Lippard Stephen_Ross_(economist) Stephen_W._Scherer Steven_Van_Slyke Takaaki_Kajita Thomas_A._Steitz Thomas_C._S%C3%BCdhof Thomas_Ebbesen Thomas_J._Sargent Tomas_Lindahl Tony_Atkinson Toshihide_Maskawa Tu_Youyou Venkatraman_Ramakrishnan Vera_Rubin Victor_Ambros Whitfield_Diffie Willard_Boyle William_Baumol William_C._Campbell_(scientist) William_E._Moerner William_Nordhaus William_Wootters Winslow_Briggs ==> created after award Yakir_Aharonov Yakov_Sinai Yoichiro_Nambu Yoshinori_Ohsumi Yoshinori_Tokura Zhong_Lin_Wang
# Group the award-aligned weekly samples by week number and summarise, per
# week, the outlink counts of all seed articles sampled in that week.
df_s_w_aw.info()
g = df_s_w_aw.groupby("week")
# A plain dict keyed by week number; nothing here needs default values.
s_w_st_dict = {
    week_name: {
        "mean_w": series.outlinks_len.mean(),
        "std_1": series.outlinks_len.sem(),     # standard error of the mean
        "count": series.outlinks_len.count(),   # samples in this week
    }
    for week_name, series in g
}
# One row per week, one column per statistic.
df_s_w_st_aw = pd.DataFrame.from_dict(s_w_st_dict, orient="index")
print(len(df_s_w_st_aw))
# Weeks backed by 10 or fewer samples are dropped.
df_s_w_st_aw = df_s_w_st_aw[df_s_w_st_aw["count"] > 10]
print(len(df_s_w_st_aw))
df_s_w_st_aw.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 144959 entries, 0 to 121 Data columns (total 4 columns): time_w 144959 non-null object outlinks_len 144959 non-null object article 144959 non-null object week 144959 non-null object dtypes: object(4) memory usage: 5.5+ MB 1125 1009 <class 'pandas.core.frame.DataFrame'> Int64Index: 1009 entries, -522 to 486 Data columns (total 3 columns): count 1009 non-null int64 mean_w 1009 non-null float64 std_1 1009 non-null float64 dtypes: float64(2), int64(1) memory usage: 31.5 KB
# Weekly mean outlink count of the seed articles relative to the award week,
# drawn as error bars (standard error) plus a thin blue line through the means.
plt.errorbar(df_s_w_st_aw.index.values, df_s_w_st_aw.mean_w, df_s_w_st_aw.std_1, linestyle='None')
plt.plot(df_s_w_st_aw.index.values, df_s_w_st_aw.mean_w, "b", linewidth=1)
#plt.xlim(0,df_s_w_st.iloc[-1].name)
#plt.ylim(0,5000)
plt.xlabel("week")
# The plotted quantity is the mean of outlinks_len, i.e. a link count.
plt.ylabel("outlink counts")
# NOTE(review): the title still says "Length of articles" although outlink
# counts are plotted - confirm the intended wording.
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.title("Length of articles of awarded scientists. 0 week is the week of award");
# Show the weekly statistics at a few week numbers on both sides of week 0.
for week_number in (10, 2, 1, 0, -1, -2, -10):
    print(df_s_w_st_aw.loc[week_number])
count 257.000000 mean_w 44.112840 std_1 1.987037 Name: 10, dtype: float64 count 257.000000 mean_w 42.824903 std_1 1.897014 Name: 2, dtype: float64 count 257.000000 mean_w 42.583658 std_1 1.883930 Name: 1, dtype: float64 count 257.000000 mean_w 42.252918 std_1 1.843108 Name: 0, dtype: float64 count 245.000000 mean_w 37.057143 std_1 1.781814 Name: -1, dtype: float64 count 243.000000 mean_w 36.798354 std_1 1.801585 Name: -2, dtype: float64 count 238.000000 mean_w 36.714286 std_1 1.869888 Name: -10, dtype: float64
def random_date(start, end, position=None):
    """Return a timestamp located between *start* and *end*.

    Parameters
    ----------
    start, end
        Interval bounds; anything ``pd.Timestamp`` accepts.
    position : float, optional
        Fraction of the interval at which to place the result (0 gives
        *start*, 1 gives *end*). When omitted, the offset is drawn with
        ``np.random.uniform`` over the whole interval.

    Returns
    -------
    pd.Timestamp
        ``start`` shifted by the chosen number of seconds.
    """
    start, end = pd.Timestamp(start), pd.Timestamp(end)
    delta = (end - start).total_seconds()
    if position is None:
        offset = np.random.uniform(0., delta)
    else:
        offset = position * delta
    # The offset is a float number of seconds. pd.Timedelta accepts fractional
    # seconds, whereas pd.offsets.Second only takes whole-number counts.
    return start + pd.Timedelta(seconds=offset)
# First and last date of the baseline revisions, then of the award dates.
for dates in (df_b.timestamp, df_sci_aw.Award_date):
    print(dates.min())
    print(dates.max())
2003-01-22 2018-01-01 2008-03-27 2015-10-12
# Resample every baseline article to weekly outlink counts aligned on a
# randomly drawn pseudo award date. Week 0 is the first sample on/after that
# date; negative weeks are the samples between article creation and it.
# The pseudo award dates are drawn from the span of the real award dates
# (the span of the baseline revisions would be the wider alternative).
start = df_sci_aw.Award_date.min()
end = df_sci_aw.Award_date.max()
today = datetime.today()
# The per-article frames are collected and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated rows every time.
# The empty frame fixes the column layout even if there are no articles.
aligned_frames = [pd.DataFrame(columns=["time_w", "outlinks_len", "article", "week"])]
g = df_b.groupby("article")
for article_name, series in g:
    print(article_name)
    created = series.timestamp.min()
    # Unseeded random draw: the dates differ from run to run.
    award_date = random_date(start, end)
    # Newest revision first, so that .iloc[0] below really is the latest
    # revision on or before the sampled day, whatever the input row order.
    history = series.sort_values("timestamp", ascending=False)

    def outlinks_on(day):
        """Outlink count of the latest revision made on or before *day*."""
        return history[history.timestamp <= day].iloc[0].outlinks_len

    # Weekly samples from the pseudo award until today; index 0, 1, 2, ...
    df1_1 = pd.DataFrame(pd.date_range(start=award_date, end=today, freq="W"),
                         columns=["time_w"])
    df1_1["time_w"] = df1_1.time_w.apply(lambda x: x.date())
    if created > award_date.date():
        # The article did not exist yet at the pseudo award: keep only the
        # weeks since its creation (they keep their post-award week numbers)
        # and skip the pre-award part entirely.
        df1_1 = df1_1[df1_1.time_w >= created].copy()
        df1_1["outlinks_len"] = df1_1.time_w.apply(outlinks_on)
        df1 = df1_1[["time_w", "outlinks_len"]].copy()
        print("==> created after award")
    else:
        # Weekly samples from creation up to the pseudo award, renumbered so
        # that the last pre-award sample is week -1.
        df1_2 = pd.DataFrame(pd.date_range(start=created, end=award_date, freq="W"),
                             columns=["time_w"])
        df1_2["time_w"] = df1_2.time_w.apply(lambda x: x.date())
        df1_2.index = df1_2.index - len(df1_2)
        df1_1["outlinks_len"] = df1_1.time_w.apply(outlinks_on)
        df1_2["outlinks_len"] = df1_2.time_w.apply(outlinks_on)
        df1 = pd.concat([df1_1, df1_2])
    df1["article"] = article_name
    df1["week"] = df1.index
    aligned_frames.append(df1)
# NOTE(review): despite the "df_s_" prefix this frame holds baseline data.
df_s_w_naw = pd.concat(aligned_frames)
%C4%90%C3%A0m_Thanh_S%C6%A1n %C5%BDeljko_Reiner ==> created after award Achim_M%C3%BCller Akiko_Iwasaki Alan_Aderem Alan_Ashworth Albert-L%C3%A1szl%C3%B3_Barab%C3%A1si Albert_Meyers Alex_Bateman ==> created after award Alex_Zettl Alexander_Zamolodchikov Ali_Khademhosseini Allen_Steere Amir_Yacoby ==> created after award Andrei_Shleifer Andrew_Bernard Andrew_H._Van_de_Ven Annarosa_Leri ==> created after award Anne_O%27Garra Antonio_Lanzavecchia Ariel_Rubinstein Armin_Falk Arun_Majumdar Axel_D._Becke Axel_Ullrich Babak_Hassibi Barry_Everitt Bengt_R._Holmstr%C3%B6m ==> created after award Bernhard_Keimer ==> created after award Bernhard_Keller ==> created after award Beth_Levine_(physician) ==> created after award Birger_Wernerfelt Biswanath_Mukherjee ==> created after award Bradley_Efron Brigitta_Stockinger ==> created after award Bruce_D._Walker ==> created after award C._N._R._Rao Campbell_Harvey ==> created after award Carlo_La_Vecchia Cees_Dekker Charles_M._Lieber Chen_Guanrong ==> created after award Chih-Jen_Lin ==> created after award Christoph_Gerber Cisca_Wijmenga ==> created after award Claire_Berger Cumrun_Vafa Dale_L._Boger Dan_Ariely Dan_Boneh Dariush_Mozaffarian ==> created after award Daron_Acemo%C4%9Flu David_Haussler David_Milstein Deborah_Estrin Dennis_P._Curran ==> created after award Didier_Astruc Dieter_Enders Dimitri_Nanopoulos Donal_Bradley Edward_Glaeser Edward_Miguel ==> created after award Edward_Ott ==> created after award Edward_R._Dougherty Elias_James_Corey Elza_Erkip ==> created after award Eric_Hanushek Esther_Duflo Eugene_Braunwald Ewan_Birney Federica_Sallusto Fiona_Powrie ==> created after award Fran%C3%A7ois_Diederich Frank_Glorius Frank_Neese ==> created after award Frans_Van_de_Werf G%C3%A1bor_A._Somorjai G%C3%A9rald_Bastard G%C3%A9rard_F%C3%A9rey George_Davey_Smith ==> created after award George_Loewenstein George_Sawatzky ==> created after award Giorgio_Parisi Giuseppe_Mingione ==> created after award Gon%C3%A7alo_Abecasis ==> 
created after award Graham_Colditz ==> created after award Graham_Fleming Guido_Imbens ==> created after award Hans_Clevers Helmut_Ringsdorf Helmut_Schwarz Heng_Li Henk_Volberda ==> created after award Henri_Berestycki ==> created after award Henry_Chesbrough Hongjie_Dai Horst_St%C3%B6cker Hyeon_Taeghwan ==> created after award Ian_Affleck ==> created after award Ian_F._Akyildiz Ian_Ford Idun_Reiten ==> created after award Igor_Jurisica ==> created after award Immanuel_Bloch Ingrid_Daubechies Ira_Pastan Irun_Cohen Ivan_K._Schuller Jack_Cuzick ==> created after award James_R._Heath Jean-Louis_Vincent ==> created after award Jean-Luc_Br%C3%A9das Jean-Marie_Tarascon ==> created after award Jean-Michel_Sav%C3%A9ant Jean-Pierre_Sauvage Jean_Fr%C3%A9chet Jeffrey_A._Harvey Jenny_Nelson Jens_Nielsen ==> created after award Jerome_Groopman Jianqing_Fan Jing_Li_(chemist) Joan_Massagu%C3%A9_Sol%C3%A9 John_A._List John_Danesh ==> created after award John_Perdew John_Robertson_(physicist) John_Y._Campbell Jon_Clardy ==> created after award Jose_Baselga ==> created after award Joseph_Schlessinger Joseph_Wang Joshua_Jortner Juan_Mart%C3%ADn_Maldacena Julian_Birkinshaw ==> created after award Jun_Ye ==> created after award K%C3%A1ri_Stef%C3%A1nsson Keith_Fagnou Keith_Olive ==> created after award Keith_Usherwood_Ingold Kendall_Houk Kenneth_Kendler ==> created after award Kim_Kimoon ==> created after award Kun-Liang_Guan Kurt_Binder Lars_E._O._Svensson Leo_Paquette Liao_Shijun Luigi_Zingales Luis_Oro ==> created after award Lynn_Schneemeyer Malcolm_Chisholm Malcolm_Green_(chemist) Marc_A._Kastner Marc_Melitz Marianne_Bertrand ==> created after award Marlan_Scully Martin_Eichenbaum ==> created after award Masaru_Tomita Matja%C5%BE_Perc ==> created after award Matthias_Egger ==> created after award Matthias_Mann Menachem_Elimelech Michael_Boehnke ==> created after award Michael_F._Lappert ==> created after award Michael_I._Jordan Michael_L._Gross_(chemist) Michael_L._Klein 
Michael_L._Tushman ==> created after award Michael_Lounsbury Michal_Lipson Michele_Parrinello Mietek_Jaroniec ==> created after award Murray_Brennan ==> created after award Napoleone_Ferrara Nathan_Seiberg Nello_Cristianini Nicholas_Bloom ==> created after award Nicholas_C._Handy Oded_Schramm Olivier_Blanchard Pan_Jianwei ==> created after award Paul_McEuen Paul_Ridker Paul_Volberding Pauline_van_den_Driessche ==> created after award Peer_Bork Peter_Gavin_Hall Peter_Grassberger Peter_Kalmus ==> created after award Peter_Knight_(scientist) ==> created after award Peter_Ozsv%C3%A1th Peter_Reinhard_Hansen Peter_Sleight ==> created after award Phaedon_Avouris Philip_Kim Rainer_Blatt Ralph_Nuzzo Ralph_Weissleder Reinhart_Ahlrichs Richard_H._Holm Richard_Roll Richard_Wilson_(physicist) ==> created after award Robert_H._Crabtree Robert_J._Birgeneau Robert_Lusch Robert_Peter_Gale Robert_Tibshirani Robert_West_(chemist) Roberto_Car ==> created after award Rodney_J._Bartlett Rolf_Apweiler Roman_Jackiw Ronald_Fedkiw Ross_Levine ==> created after award Rudolf_Grimm Rudolf_Jaenisch Ruedi_Aebersold Ruslan_Medzhitov Saeid_Abbasbandy Salim_Yusuf Salvador_Moncada Samuel_Broder Sang-Wook_Cheong ==> created after award Sankar_Das_Sarma Sendhil_Mullainathan Sergey_Fomin Sergio_Ferrara Sharon_Ann_Hunt ==> created after award Shin%27ichi_Nojiri ==> created after award Shlomo_Shamai Shuji_Ogino ==> created after award Siamon_Gordon Simeon_Djankov Simon_Johnson_(economist) Simon_White Spyros_Makridakis ==> created after award Stefanie_Dimmeler ==> created after award Stephen_Dunnett ==> created after award Stephen_J._Lippard Stephen_L._Buchwald ==> created after award Stephen_MacMahon Stephen_R._Bloom ==> created after award Stephen_Vargo Steven_Kaplan_(economist) Stuart_A._Rice Stuart_Parkin Subir_Sachdev Subra_Suresh Tadamitsu_Kishimoto Tadatsugu_Taniguchi ==> created after award Takuzo_Aida Tasawar_Hayat Theodore_H._Geballe Thomas_J.R._Hughes Thomas_Starzl Tilman_Esslinger 
Tim_Bollerslev Tomas_H%C3%B6kfelt Trevor_Hastie ==> created after award Ulrike_Malmendier ==> created after award Uri_Gneezy ==> created after award Valarie_Zeithaml ==> created after award Valentin_Fuster Victor_Dzau Vincenzo_Balzani Wayne_Fuller Werner_Hacke William_Easterly William_H._Miller_(chemistry) ==> created after award Witold_Nazarewicz Xavier_Gabaix Yang_Huanming Younan_Xia ==> created after award Zolt%C3%A1n_Szab%C3%B3_(mathematician)
# Group the outlink counts of non-awarded scientists by week number.
df_s_w_naw.info()
g = df_s_w_naw.groupby("week")

# Per-week summary: mean outlink count, standard error of that mean (stored
# under the key "std_1") and the number of non-null observations.
# A plain dict suffices because every key is assigned explicitly.
s_w_st_dict = {
    week_name: {
        "mean_w": week_rows.outlinks_len.mean(),
        "std_1": week_rows.outlinks_len.sem(),
        # Count only the summarised column rather than every column.
        "count": week_rows.outlinks_len.count(),
    }
    for week_name, week_rows in g
}

# Build the summary frame: one row per week, one column per statistic.
df_s_w_st_naw = pd.DataFrame.from_dict(s_w_st_dict, orient="index")
print(len(df_s_w_st_naw))

# Keep only the weeks backed by more than 10 observations.
df_s_w_st_naw = df_s_w_st_naw[df_s_w_st_naw["count"] > 10]
print(len(df_s_w_st_naw))
df_s_w_st_naw.info()

# Error bars without a connecting line, then the weekly mean as a thin blue line.
plt.errorbar(df_s_w_st_naw.index.values, df_s_w_st_naw.mean_w,
             df_s_w_st_naw.std_1, linestyle='None')
plt.plot(df_s_w_st_naw.index.values, df_s_w_st_naw.mean_w, "b", linewidth=1)
plt.xlabel("week")
plt.ylabel("outlink counts")
# The trailing semicolon keeps the notebook from echoing the Text object.
plt.title("Length of articles of non-awarded scientists. 0 week is the week of award");
<class 'pandas.core.frame.DataFrame'> Int64Index: 118562 entries, 0 to -1 Data columns (total 4 columns): time_w 118562 non-null object outlinks_len 118562 non-null object article 118562 non-null object week 118562 non-null object dtypes: object(4) memory usage: 4.5+ MB 1070 979 <class 'pandas.core.frame.DataFrame'> Int64Index: 979 entries, -477 to 501 Data columns (total 3 columns): count 979 non-null int64 mean_w 979 non-null float64 std_1 979 non-null float64 dtypes: float64(2), int64(1) memory usage: 30.6 KB
#import matplotlib as mpl
#import matplotlib.pyplot as plt
#import seaborn.apionly as sns
#import imp
#imp.reload(mpl); imp.reload(plt); imp.reload(sns)
<module 'seaborn.apionly' from 'C:\Anaconda2\lib\site-packages\seaborn\apionly.pyc'>
# Save the four per-week summary tables as tab-separated files; the frame
# index is written as the first column of each file.
for _table, _target in (
    (df_s_w_st_aw, "data/awarded_inf_supply_outlinks_data_forplot.csv"),
    (df_s_w_st_naw, "data/non-awarded_inf_supply_outlinks_data_forplot.csv"),
    (df_s_w_st, "data/awarded_inf_supply_outlinks_0week_data_forplot.csv"),
    (df_b_w_st, "data/non-awarded_inf_supply_outlinks_0week_data_forplot.csv"),
):
    _table.to_csv(_target, sep="\t")
# Load the four per-week summary tables back from their tab-separated files.
# pd.read_csv(..., index_col=0) restores the first column (the index that
# to_csv wrote) as the frame index. It is used instead of DataFrame.from_csv,
# which is deprecated and no longer exists in pandas >= 1.0.
df_s_w_st = pd.read_csv(
    "data/awarded_inf_supply_outlinks_0week_data_forplot.csv",
    sep="\t", index_col=0)
df_b_w_st = pd.read_csv(
    "data/non-awarded_inf_supply_outlinks_0week_data_forplot.csv",
    sep="\t", index_col=0)
df_s_w_st_aw = pd.read_csv(
    "data/awarded_inf_supply_outlinks_data_forplot.csv",
    sep="\t", index_col=0)
df_s_w_st_naw = pd.read_csv(
    "data/non-awarded_inf_supply_outlinks_data_forplot.csv",
    sep="\t", index_col=0)
from matplotlib.container import ErrorbarContainer
from matplotlib.legend_handler import HandlerErrorbar

# Two side-by-side panels sharing the y axis:
#   left  - weekly means with week 0 = week of the award,
#   right - weekly means with week 0 = week the article was created.
fig, (ax0, ax1) = plt.subplots(ncols=2, sharey=True, figsize=[20, 10], dpi=500)

# --- left panel: awarded vs. non-awarded, aligned on the award week ---
ax0.errorbar(df_s_w_st_aw.index.values, df_s_w_st_aw.mean_w, df_s_w_st_aw.std_1,
             c="b", alpha=0.7, linestyle='None',
             label="awarded scientist, std error")
ax0.plot(df_s_w_st_aw.index.values, df_s_w_st_aw.mean_w, "darkblue",
         linewidth=1, label="awarded scientist, mean")
# The x range spans the first to the last week of the awarded table.
ax0.set_xlim(df_s_w_st_aw.iloc[0].name, df_s_w_st_aw.iloc[-1].name)
ax0.set_ylim(10, 175)
ax0.set_xlabel("week")
ax0.set_ylabel("outlink counts")
ax0.set_title("Length of article; 0 week is the week of award")
ax0.errorbar(df_s_w_st_naw.index.values, df_s_w_st_naw.mean_w, df_s_w_st_naw.std_1,
             c="green", alpha=0.7, linestyle='None',
             label="non-awarded scientist, std error")
ax0.plot(df_s_w_st_naw.index.values, df_s_w_st_naw.mean_w, "lime",
         linewidth=1, label="non-awarded scientist, mean")
# Vertical dashed marker at week 0, spanning the fixed y range set above.
ax0.plot([0, 0], [10, 175], "r--", label="week of award")
leg = ax0.legend(loc=2, handler_map={ErrorbarContainer: HandlerErrorbar(numpoints=15, xpad=0.1)})
# The data are drawn semi-transparent; make the legend samples fully opaque.
for lh in leg.legendHandles:
    lh.set_alpha(1)

# --- right panel: the two tables aligned on the article-creation week ---
ax1.errorbar(df_s_w_st.index.values, df_s_w_st.mean_w, df_s_w_st.std_1,
             c="b", alpha=0.8, lw=2, linestyle='None',
             label="awarded scientist, std error")
ax1.plot(df_s_w_st.index.values, df_s_w_st.mean_w, "darkblue",
         linewidth=1, label="awarded scientist, mean")
ax1.errorbar(df_b_w_st.index.values, df_b_w_st.mean_w, df_b_w_st.std_1,
             c="green", alpha=0.8, lw=2, linestyle='None',
             label="non-awarded scientist, std error")
ax1.plot(df_b_w_st.index.values, df_b_w_st.mean_w, "lime",
         linewidth=1, label="non-awarded scientist, mean")
leg = ax1.legend(loc=2, handler_map={ErrorbarContainer: HandlerErrorbar(numpoints=15, xpad=0.1)})
for lh in leg.legendHandles:
    lh.set_alpha(1)
ax1.set_xlim(df_s_w_st.iloc[0].name, df_s_w_st.iloc[-1].name)
ax1.set_xlabel("week")
ax1.set_title('Length of article; 0 week is the week when the article about scientists was created')

# tight_layout recomputes the subplot spacing, so the manual adjustment must
# come after it; otherwise the requested wspace/hspace is discarded.
plt.tight_layout()
plt.subplots_adjust(wspace=0.01, hspace=0.01)
plt.show()
from matplotlib.container import ErrorbarContainer
from matplotlib.legend_handler import HandlerErrorbar

# Compact, untitled version of the two-panel figure, written to
# plots/outlink_count.png. Both panels share the y axis.
fig, (ax0, ax1) = plt.subplots(ncols=2, sharey=True, figsize=[8, 4], dpi=500)

# --- left panel: awarded vs. non-awarded, with a marker at week 0 ---
ax0.errorbar(df_s_w_st_aw.index.values, df_s_w_st_aw.mean_w, df_s_w_st_aw.std_1,
             c="b", alpha=0.7, linestyle='None',
             label="Awarded scientist, std error")
ax0.plot(df_s_w_st_aw.index.values, df_s_w_st_aw.mean_w, "darkblue",
         linewidth=1, label="Awarded scientist, mean")
# The x range spans the first to the last week of the awarded table.
ax0.set_xlim(df_s_w_st_aw.iloc[0].name, df_s_w_st_aw.iloc[-1].name)
ax0.set_ylim(10, 175)
ax0.set_xlabel("Week")
ax0.set_ylabel("Outlink counts")
ax0.errorbar(df_s_w_st_naw.index.values, df_s_w_st_naw.mean_w, df_s_w_st_naw.std_1,
             c="green", alpha=0.7, linestyle='None',
             label="Non-awarded scientist, std error")
ax0.plot(df_s_w_st_naw.index.values, df_s_w_st_naw.mean_w, "lime",
         linewidth=1, label="Non-awarded scientist, mean")
# Vertical dashed marker at week 0, spanning the fixed y range set above.
ax0.plot([0, 0], [10, 175], "r--", label="Week of award")
leg = ax0.legend(loc=2, handler_map={ErrorbarContainer: HandlerErrorbar(numpoints=15, xpad=0.1)})
# The data are drawn semi-transparent; make the legend samples fully opaque.
for lh in leg.legendHandles:
    lh.set_alpha(1)

# --- right panel: the df_s_w_st and df_b_w_st tables ---
ax1.errorbar(df_s_w_st.index.values, df_s_w_st.mean_w, df_s_w_st.std_1,
             c="b", alpha=0.8, lw=2, linestyle='None',
             label="Awarded scientist, std error")
ax1.plot(df_s_w_st.index.values, df_s_w_st.mean_w, "darkblue",
         linewidth=1, label="Awarded scientist, mean")
ax1.errorbar(df_b_w_st.index.values, df_b_w_st.mean_w, df_b_w_st.std_1,
             c="green", alpha=0.8, lw=2, linestyle='None',
             label="Non-awarded scientist, std error")
ax1.plot(df_b_w_st.index.values, df_b_w_st.mean_w, "lime",
         linewidth=1, label="Non-awarded scientist, mean")
leg = ax1.legend(loc=2, handler_map={ErrorbarContainer: HandlerErrorbar(numpoints=15, xpad=0.1)})
for lh in leg.legendHandles:
    lh.set_alpha(1)
ax1.set_xlim(df_s_w_st.iloc[0].name, df_s_w_st.iloc[-1].name)
ax1.set_xlabel("Week")

# Manual spacing is applied after tight_layout so that it is not overridden.
plt.tight_layout()
plt.subplots_adjust(wspace=0.03, hspace=0.03)
plt.savefig("plots/outlink_count.png", dpi=500)
plt.show()