import pandas
import psycopg2
def months_between_datetimes(date_from, date_to):
"""
Compute the number of calendar months (rather than 30 day units)
between two dates.
date_from and date_to are pandas datetime series .dt attributes
See http://stackoverflow.com/a/4040338/4651668.
"""
date_from = pandas.to_datetime(date_from)
date_to = pandas.to_datetime(date_to)
if pandas.isna(date_from) or pandas.isna(date_to):
return None
delta_years = 12 * (date_to.year - date_from.year)
delta_months = date_to.month - date_from.month
return delta_years + delta_months
connection = psycopg2.connect(host='localhost', dbname="rxdb", user='postgres', port=65500)
query = '''
SELECT
id AS rxivist_preprint_id,
paper.articles.posted AS preprint_date,
paper.articles.doi AS preprint_doi,
paper.publication_dates.date AS journal_date,
LOWER(paper.article_publications.doi) AS journal_doi
FROM paper.articles
LEFT JOIN paper.article_publications
ON paper.articles.id=paper.article_publications.article
LEFT JOIN paper.publication_dates
ON paper.articles.id=paper.publication_dates.article
ORDER BY rxivist_preprint_id
;
'''
preprint_df = pandas.read_sql(sql=query, con=connection, parse_dates=['preprint_date', 'journal_date'])
assert not preprint_df.rxivist_preprint_id.duplicated().any()
preprint_df.head(4)
rxivist_preprint_id | preprint_date | preprint_doi | journal_date | journal_doi | |
---|---|---|---|---|---|
0 | 386 | 2018-05-21 | 10.1101/327015 | NaT | None |
1 | 387 | 2018-08-03 | 10.1101/384123 | 2018-11-19 | 10.1128/aac.01646-18 |
2 | 388 | 2018-08-03 | 10.1101/383109 | NaT | None |
3 | 389 | 2018-07-31 | 10.1101/381640 | 2018-11-01 | 10.1099/mgen.0.000234 |
preprint_df.to_csv('data/01.preprints.tsv', sep='\t', index=False)
print(f'''\
{len(preprint_df):,} preprints
{sum(preprint_df.journal_date.notna()):,} preprints with a journal date
{sum(preprint_df.journal_doi.notna()):,} preprints with a journal DOI
''')
37,648 preprints 15,797 preprints with a journal date 15,797 preprints with a journal DOI
query = '''
SELECT
paper.articles.id AS rxivist_preprint_id,
paper.articles.posted AS preprint_date,
paper.publication_dates.date AS journal_date,
CONCAT(paper.article_traffic.year, '-', TO_CHAR(paper.article_traffic.month, 'fm00')) AS period,
paper.article_traffic.abstract AS preprint_views,
paper.article_traffic.pdf AS preprint_downloads
FROM paper.article_traffic
JOIN paper.articles
ON paper.articles.id=paper.article_traffic.article
LEFT JOIN paper.publication_dates
ON paper.articles.id=paper.publication_dates.article
ORDER BY rxivist_preprint_id, period
;
'''
traffic_df = pandas.read_sql(sql=query, con=connection, parse_dates='preprint_date')
traffic_df.head(3)
rxivist_preprint_id | preprint_date | journal_date | period | preprint_views | preprint_downloads | |
---|---|---|---|---|---|---|
0 | 386 | 2018-05-21 | None | 2018-05 | 569 | 64 |
1 | 386 | 2018-05-21 | None | 2018-06 | 265 | 49 |
2 | 386 | 2018-05-21 | None | 2018-07 | 231 | 29 |
# Compute the number of calendar months from preprint / journal publication for each traffic period
traffic_df['months_since_preprint'] = [
months_between_datetimes(*pair) for pair in
zip(traffic_df.pop('preprint_date'), pandas.to_datetime(traffic_df.period))
]
traffic_df['months_since_journal'] = pandas.Series([
months_between_datetimes(*pair) for pair in
zip(traffic_df.pop('journal_date'), pandas.to_datetime(traffic_df.period))
], dtype='Int64')
traffic_df.head(2)
rxivist_preprint_id | period | preprint_views | preprint_downloads | months_since_preprint | months_since_journal | |
---|---|---|---|---|---|---|
0 | 386 | 2018-05 | 569 | 64 | 0 | NaN |
1 | 386 | 2018-06 | 265 | 49 | 1 | NaN |
traffic_df.to_csv('data/01.preprint-traffic.tsv.xz', sep='\t', index=False)
# Locate the Project Rephetio Preprint by DOI
rephetio_df = preprint_df.query('preprint_doi == "10.1101/087619"')
rephetio_df
rxivist_preprint_id | preprint_date | preprint_doi | journal_date | journal_doi | |
---|---|---|---|---|---|
3590 | 3976 | 2016-11-14 | 10.1101/087619 | 2017-09-22 | 10.7554/elife.26726 |
# Traffic for Project Rephetio Preprint
rephetio_df[['rxivist_preprint_id']].merge(traffic_df)
rxivist_preprint_id | period | preprint_views | preprint_downloads | months_since_preprint | months_since_journal | |
---|---|---|---|---|---|---|
0 | 3976 | 2016-11 | 1158 | 152 | 0 | -10 |
1 | 3976 | 2016-12 | 179 | 112 | 1 | -9 |
2 | 3976 | 2017-01 | 114 | 30 | 2 | -8 |
3 | 3976 | 2017-02 | 129 | 43 | 3 | -7 |
4 | 3976 | 2017-03 | 355 | 142 | 4 | -6 |
5 | 3976 | 2017-04 | 111 | 49 | 5 | -5 |
6 | 3976 | 2017-05 | 204 | 81 | 6 | -4 |
7 | 3976 | 2017-06 | 162 | 73 | 7 | -3 |
8 | 3976 | 2017-07 | 137 | 60 | 8 | -2 |
9 | 3976 | 2017-08 | 200 | 62 | 9 | -1 |
10 | 3976 | 2017-09 | 492 | 168 | 10 | 0 |
11 | 3976 | 2017-10 | 130 | 61 | 11 | 1 |
12 | 3976 | 2017-11 | 116 | 34 | 12 | 2 |
13 | 3976 | 2017-12 | 71 | 37 | 13 | 3 |
14 | 3976 | 2018-01 | 47 | 19 | 14 | 4 |
15 | 3976 | 2018-02 | 60 | 19 | 15 | 5 |
16 | 3976 | 2018-03 | 72 | 18 | 16 | 6 |
17 | 3976 | 2018-04 | 38 | 8 | 17 | 7 |
18 | 3976 | 2018-05 | 68 | 10 | 18 | 8 |
19 | 3976 | 2018-06 | 62 | 16 | 19 | 9 |
20 | 3976 | 2018-07 | 63 | 18 | 20 | 10 |
21 | 3976 | 2018-08 | 91 | 16 | 21 | 11 |
22 | 3976 | 2018-09 | 63 | 27 | 22 | 12 |
23 | 3976 | 2018-10 | 24 | 51 | 23 | 13 |
24 | 3976 | 2018-11 | 43 | 19 | 24 | 14 |