import pandas as pd
def load_jcr_old(year, data_dir='data'):
    """Load one year's impact factors and article counts, indexed by ISSN.

    Reads ``<data_dir>/SCIE_JCR<year>.csv``, skipping the first two lines of
    the file, and keeps only the ``ISSN``, ``Impact Factor`` and
    ``{<year>} Articles`` columns.

    Args:
        year: Report year; selects the file and is used as the suffix of the
            output column names.
        data_dir: Directory containing the CSV files. Defaults to ``'data'``,
            the location the original notebook reads from.

    Returns:
        A DataFrame indexed by ``ISSN`` with two columns, ``IF<year>`` and
        ``A<year>``.

    Raises:
        KeyError: if one of the three expected columns is absent from the CSV.
    """
    # The article-count header literally contains the year in braces,
    # e.g. "{2000} Articles".
    columns_to_load = ['ISSN', 'Impact Factor', '{' + str(year) + '} Articles']
    path = '{}/SCIE_JCR{}.csv'.format(data_dir, year)
    tbl = pd.read_csv(path, skiprows=2)[columns_to_load]
    # Year-suffixed names keep columns distinct once several years are merged.
    tbl.columns = ['ISSN', 'IF{}'.format(year), 'A{}'.format(year)]
    return tbl.set_index('ISSN')
# Try the loader on a single year (2000) and preview the first rows.
tbl2000 = load_jcr_old(2000)
tbl2000.head()
IF2000 | A2000 | |
---|---|---|
ISSN | ||
0149-1423 | 1.494 | 81 |
0942-8925 | 0.866 | 126 |
0025-5858 | 0.232 | 27 |
1069-6563 | 1.419 | 152 |
1040-2446 | 1.554 | 229 |
# List the input files under data/ (IPython shell shortcut; not valid in a plain Python script).
ls data
2013_IF.csv SCIE_JCR2004.csv SCIE_JCR2009.csv SCIE_JCR2012-category.csv SCIE_JCR2000.csv SCIE_JCR2005.csv SCIE_JCR2010-category.csv SCIE_JCR2012.csv SCIE_JCR2001.csv SCIE_JCR2006.csv SCIE_JCR2010.csv SCIE_JCR2002.csv SCIE_JCR2007.csv SCIE_JCR2011-category.csv SCIE_JCR2003.csv SCIE_JCR2008.csv SCIE_JCR2011.csv
from functools import reduce


def _outer_join_on_index(left, right):
    """Outer-join two per-year tables on their shared ISSN index."""
    return pd.merge(left, right, how='outer', left_index=True, right_index=True)


# One table per report year, 2000 through 2012 inclusive.
oldtables = [load_jcr_old(year) for year in range(2000, 2013)]

# Fold the yearly tables into one wide table, then drop every journal that
# has no article count for 2012.
joined_all_years = reduce(_outer_join_on_index, oldtables)
alloldtables = joined_all_years.dropna(subset=('A2012',))
alloldtables.tail()
IF2000 | A2000 | IF2001 | A2001 | IF2002 | A2002 | IF2003 | A2003 | IF2004 | A2004 | ... | IF2008 | A2008 | IF2009 | A2009 | IF2010 | A2010 | IF2011 | A2011 | IF2012 | A2012 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ISSN | |||||||||||||||||||||
8756-3282 | 3.998 | 199 | 3.247 | 169 | 3.755 | 247 | 3.572 | 191 | 3.530 | 283 | ... | 4.145 | 275 | 4.089 | 307 | 4.601 | 339 | 4.023 | 340 | 3.823 | 309 |
8756-5641 | 0.871 | 25 | 0.914 | 23 | 1.351 | 26 | 1.672 | 33 | 1.953 | 37 | ... | 1.964 | 36 | 2.321 | 48 | 2.440 | 42 | 2.556 | 57 | 2.899 | 40 |
8756-758X | 0.675 | 99 | 0.693 | 75 | 0.701 | 105 | 0.706 | 106 | 0.673 | 101 | ... | 0.934 | 91 | 0.835 | 88 | 0.894 | 75 | 0.847 | 91 | 0.861 | 100 |
8756-7938 | 1.897 | 163 | 1.821 | 164 | 1.734 | 207 | 1.488 | 267 | 1.635 | 260 | ... | 2.108 | 173 | 2.398 | 208 | 2.178 | 210 | 2.340 | 199 | 1.853 | 185 |
8756-971X | 0.763 | 60 | 0.521 | 29 | 0.717 | 26 | 0.811 | 78 | 0.691 | 85 | ... | 0.894 | 100 | 0.906 | 83 | 1.066 | 49 | NaN | NaN | 0.755 | 79 |
5 rows × 26 columns
The citation report for 2013 was downloaded from Dr. N's Dropbox and converted to CSV using Excel.
def _upper_or_blank(title):
    """Return *title* upper-cased, or '' when it is not a string."""
    return title.upper() if isinstance(title, str) else ''


# Skip the first line of the file, read 'Not Available' as NaN and parse
# comma-grouped numbers.
raw2013 = pd.read_csv('data/2013_IF.csv', skiprows=1,
                      na_values=('Not Available',), thousands=',')

# Drop fully duplicated rows first, then keep the three columns used below.
tbl = raw2013.drop_duplicates()[
    ['Full Journal Title', 'Journal Impact Factor', 'Total Cites']]

# Upper-case the titles; non-string entries become ''.
tbl['Full Journal Title'] = tbl['Full Journal Title'].apply(_upper_or_blank)
tbl.head()
Full Journal Title | Journal Impact Factor | Total Cites | |
---|---|---|---|
0 | CA-A CANCER JOURNAL FOR CLINICIANS | 162.500 | 16130 |
1 | NEW ENGLAND JOURNAL OF MEDICINE | 54.420 | 257469 |
2 | CHEMICAL REVIEWS | 45.661 | 124463 |
3 | REVIEWS OF MODERN PHYSICS | 42.860 | 37647 |
4 | NATURE | 42.351 | 590324 |
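The 2013 table lacks both the ISSN and the 2013 article count. Infer the article count as Total Cites / (2 × Impact Factor). This is only a rough estimate: Total Cites counts citations to everything a journal has published, not just the two-year window used by the Impact Factor, so the result tends to overestimate the true count.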
# Estimated article count: total cites divided by the impact factor, halved.
# Rows where the estimate is NaN get 0 before the cast to int.
estimated_articles = tbl['Total Cites'].div(tbl['Journal Impact Factor']).div(2)
tbl['A2013'] = estimated_articles.fillna(0).astype(int)

# Copy the impact factor under the IF<year> naming used by the older tables.
tbl['IF2013'] = tbl['Journal Impact Factor']
tbl.head()
Full Journal Title | Journal Impact Factor | Total Cites | A2013 | IF2013 | |
---|---|---|---|---|---|
0 | CA-A CANCER JOURNAL FOR CLINICIANS | 162.500 | 16130 | 49 | 162.500 |
1 | NEW ENGLAND JOURNAL OF MEDICINE | 54.420 | 257469 | 2365 | 54.420 |
2 | CHEMICAL REVIEWS | 45.661 | 124463 | 1362 | 45.661 |
3 | REVIEWS OF MODERN PHYSICS | 42.860 | 37647 | 439 | 42.860 |
4 | NATURE | 42.351 | 590324 | 6969 | 42.351 |
# Keep only the year-suffixed 2013 columns plus the journal title.
jcr2013 = tbl[['A2013', 'IF2013', 'Full Journal Title']]
We need more metadata, both for filtering by subject category and for matching the 2000–2012 tables to the 2013 table by journal title.
# Journal metadata from the 2012 category listing: ISSN, subject category
# and the full and abbreviated titles. The first two lines of the file are
# skipped.
category_columns = ['ISSN', 'Subject Category', 'Journal Title (Full)',
                    'Abbreviated Journal Title']
raw_category2012 = pd.read_csv('data/SCIE_JCR2012-category.csv', skiprows=2)
category2012 = raw_category2012[category_columns]
category2012.head()
ISSN | Subject Category | Journal Title (Full) | Abbreviated Journal Title | |
---|---|---|---|---|
0 | 0814-6039 | ACOUSTICS | ACOUSTICS AUSTRALIA | ACOUST AUST |
1 | 1063-7710 | ACOUSTICS | ACOUSTICAL PHYSICS | ACOUST PHYS+ |
2 | 1610-1928 | ACOUSTICS | ACTA ACUSTICA UNITED WITH ACUSTICA | ACTA ACUST UNITED AC |
3 | 0003-682X | ACOUSTICS | APPLIED ACOUSTICS | APPL ACOUST |
4 | 0137-5075 | ACOUSTICS | ARCHIVES OF ACOUSTICS | ARCH ACOUST |
# Subject categories whose journals are kept for the analysis.
selected_categories = {
    'BIOCHEMICAL RESEARCH METHODS',
    'BIOCHEMISTRY & MOLECULAR BIOLOGY',
    'BIOLOGY',
    'BIOTECHNOLOGY & APPLIED MICROBIOLOGY',
    'CELL BIOLOGY',
    'BIOPHYSICS',
    'CRYSTALLOGRAPHY',
    'DEVELOPMENTAL BIOLOGY',
    'GENETICS & HEREDITY',
    'IMMUNOLOGY',
    'MATHEMATICAL & COMPUTATIONAL BIOLOGY',
    'MICROBIOLOGY',
    'MYCOLOGY',
    'MULTIDISCIPLINARY SCIENCES',
    'NEUROSCIENCES',
    'PLANT SCIENCES',
    'VIROLOGY',
}
# Keep the rows whose subject category is one of the selected ones, then
# show the row counts before and after filtering.
in_selected_category = category2012['Subject Category'].isin(selected_categories)
selectedcategory2012 = category2012[in_selected_category]
len(category2012), len(selectedcategory2012)
(13216, 1945)
We have loaded three tables so far. Merge them into one.
# Attach the 2000-2012 figures to the selected journals by ISSN. The inner
# join drops journals missing from either side.
merge1 = pd.merge(selectedcategory2012, alloldtables, how='inner',
                  left_on='ISSN', right_index=True)

# Attach the 2013 figures by exact full-title match (the 2013 table has no
# ISSN), then order by 2013 impact factor, highest first.
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
merge2 = pd.merge(merge1, jcr2013, how='inner',
                  left_on='Journal Title (Full)',
                  right_on='Full Journal Title'
                  ).sort_values('IF2013', ascending=False)
merge2.head()
ISSN | Subject Category | Journal Title (Full) | Abbreviated Journal Title | IF2000 | A2000 | IF2001 | A2001 | IF2002 | A2002 | ... | A2009 | IF2010 | A2010 | IF2011 | A2011 | IF2012 | A2012 | A2013 | IF2013 | Full Journal Title | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1443 | 0028-0836 | MULTIDISCIPLINARY SCIENCES | NATURE | NATURE | 25.814 | 1315 | 27.955 | 939 | 30.432 | 889 | ... | 866 | 36.104 | 862 | 36.280 | 841 | 38.597 | 869 | 6969 | 42.351 | NATURE |
1192 | 0732-0582 | IMMUNOLOGY | ANNUAL REVIEW OF IMMUNOLOGY | ANNU REV IMMUNOL | 50.340 | 31 | 46.233 | 24 | 54.455 | 26 | ... | 24 | 49.271 | 22 | 52.761 | 23 | 36.556 | 28 | 201 | 41.392 | ANNUAL REVIEW OF IMMUNOLOGY |
1154 | 1471-0056 | GENETICS & HEREDITY | NATURE REVIEWS GENETICS | NAT REV GENET | NaN | 18 | 12.333 | 66 | 21.762 | 83 | ... | 75 | 32.745 | 71 | 38.075 | 71 | 41.063 | 70 | 331 | 39.794 | NATURE REVIEWS GENETICS |
827 | 1087-0156 | BIOTECHNOLOGY & APPLIED MICROBIOLOGY | NATURE BIOTECHNOLOGY | NAT BIOTECHNOL | 11.542 | 195 | 11.310 | 153 | 12.822 | 160 | ... | 103 | 31.090 | 117 | 23.268 | 84 | 32.438 | 92 | 539 | 39.080 | NATURE BIOTECHNOLOGY |
828 | 1474-1776 | BIOTECHNOLOGY & APPLIED MICROBIOLOGY | NATURE REVIEWS DRUG DISCOVERY | NAT REV DRUG DISCOV | NaN | NaN | NaN | NaN | NaN | 91 | ... | 65 | 28.712 | 67 | 29.008 | 61 | 33.078 | 43 | 288 | 37.231 | NATURE REVIEWS DRUG DISCOVERY |
5 rows × 33 columns
# Save the merged table to JCRMerged.csv in the working directory.
merge2.to_csv('JCRMerged.csv')
# Fit a per-journal trend to the recent impact factors: the slope of a
# straight line through log2(IF) over trend_first_year..trend_last_year.
# NOTE: `np` (numpy) must already be in scope; it is not imported in the
# visible part of this file.
gtbl = merge2  # alias, not a copy: columns added below also appear on merge2
trend_first_year = 2011
trend_last_year = 2013
trendfit_years = range(trend_first_year, trend_last_year + 1)
trendfit_if_labels = ['IF{}'.format(y) for y in trendfit_years]

# One LOGIF<year> column per fitted year.
for y in trendfit_years:
    gtbl['LOGIF{}'.format(y)] = np.log2(gtbl['IF{}'.format(y)])
logif_labels = ['LOGIF{}'.format(y) for y in trendfit_years]


def _log_if_slope(log_ifs):
    """Return the least-squares slope of *log_ifs* against trendfit_years.

    Returns NaN when any value is missing or non-finite, so such rows are
    never handed to the least-squares solver.
    """
    values = np.asarray(log_ifs, dtype=float)
    if not np.isfinite(values).all():
        return np.nan
    return np.polyfit(trendfit_years, values, 1)[0]


# Apply over the numeric LOGIF columns only, not over whole mixed-type rows.
gtbl['trend_slope'] = gtbl[logif_labels].apply(_log_if_slope, axis=1)

# .ix has been removed from pandas; .iloc is the positional equivalent.
gtbl.iloc[:, -6:].head()
IF2013 | Full Journal Title | LOGIF2011 | LOGIF2012 | LOGIF2013 | trend_slope | |
---|---|---|---|---|---|---|
1443 | 42.351 | NATURE | 5.181103 | 5.270417 | 5.404324 | 0.111611 |
1192 | 41.392 | ANNUAL REVIEW OF IMMUNOLOGY | 5.721400 | 5.192036 | 5.371280 | -0.175060 |
1154 | 39.794 | NATURE REVIEWS GENETICS | 5.250772 | 5.359767 | 5.314479 | 0.031853 |
827 | 39.080 | NATURE BIOTECHNOLOGY | 4.540275 | 5.019613 | 5.288359 | 0.374042 |
828 | 37.231 | NATURE REVIEWS DRUG DISCOVERY | 4.858379 | 5.047800 | 5.218432 | 0.180027 |
# Histogram of the fitted trend slopes over the range [-2, 2], in 50 bins.
gtbl['trend_slope'].hist(range=(-2, 2), bins=50)
<matplotlib.axes.AxesSubplot at 0x7f96be3ba208>
# Draw every journal as a small line of its 2003-2013 impact factor history,
# placed at x = (recent geometric-mean IF) ** xremapscale and y = trend slope.
# NOTE: `np`, `plt` and `DROPBOXHOME` must already be in scope; none of them
# is defined in the visible part of this file.

# Geometric mean of the impact factors over the trend-fit years.
# skipna=False: by default product() skips NaN, which would silently return
# the product of fewer values while still taking the n-th root. A journal
# with a missing year now gets NaN and is excluded by the `> 4` filter below.
gtbl['recent_if'] = (
    gtbl[trendfit_if_labels].product(axis=1, skipna=False)
    ** (1.0 / len(trendfit_years)))

full_if_years = np.arange(2003, 2014)
full_if_labels = ['IF{}'.format(y) for y in full_if_years]
years_zero_centered = full_if_years - full_if_years.mean()

xshrink = 0.1       # horizontal size of each journal's line, per year
yshrink = 0.1       # vertical size of each journal's line, per unit of IF / recent_if
xremapscale = 0.75  # power applied to the x axis to compress high impact factors
xlabelcutoff = 15 ** xremapscale  # journals right of IF 15 are always labelled

fig = plt.figure(figsize=(8.5, 7))
plottbl = gtbl[gtbl['recent_if'] > 4]

for rowi, row in plottbl.iterrows():
    # alignment positions of this subplot
    xcenter = row['recent_if'] ** xremapscale
    ytop = row['trend_slope']

    # Force a float array: a row taken from a mixed-type frame is object
    # dtype, for which np.isnan and NaN-aware min/max are not reliable.
    ifs = np.array(row[full_if_labels], dtype=float)
    nmissing = int(np.isnan(ifs).sum())

    # Scale by the recent IF and centre the line vertically on ytop.
    yoffsets = ifs / row['recent_if'] * yshrink
    ylow = np.nanmin(yoffsets)
    yheight = np.nanmax(yoffsets) - ylow
    yoffsets = yoffsets - ylow - yheight / 2

    # Missing values are assumed to be the leading (earliest) years, so
    # dropping the first `nmissing` points removes exactly the gaps.
    ypoints = (yoffsets + ytop)[nmissing:]
    xpoints = (years_zero_centered * xshrink + xcenter)[nmissing:]
    plt.plot(xpoints, ypoints, c='black', alpha=0.7)

    # Label only high-IF journals and those with a pronounced trend.
    if xcenter > xlabelcutoff or abs(ytop) > 0.2:
        plt.annotate(row['Abbreviated Journal Title'].title(),
                     (xpoints[-1], ypoints[-1]))

plt.axhline(0, c='black', lw=1)
plt.ylim(-0.8, 0.8)

# Ticks sit at remapped positions but show the untransformed IF values.
xtickpositions = np.array([5, 10, 15, 20, 25, 30, 35, 40, 45])
plt.xticks(xtickpositions ** xremapscale, [str(p) for p in xtickpositions])
plt.grid(True, ls='-', alpha=0.3)

# Derive the year range from the fit settings so the labels cannot drift.
plt.xlabel('Mean Impact Factor {}-{}'.format(trend_first_year, trend_last_year))
plt.ylabel('IF Change Trend {}-{}'.format(trend_first_year, trend_last_year))
plt.title('All Journals Related to Molecular & Cell Biology')
plt.savefig(DROPBOXHOME+'/Data/2014/jcr-trend-all.pdf')