"""Exploratory statistics on editor handling times.

Loads a gzip-compressed pickle from
``~/private/plos_one/editors_plos_one_dates.dat`` and prints / plots summary
statistics.  The loaded object is indexed as ``dat[key]`` and each value is
iterated as a sequence of records that provide ``'received_date'``,
``'accepted_date'`` and ``'publication_date'`` (each unpacked into
``datetime.date(*value)``) plus an ``'id'``.  Every key is treated as one
editor.  NOTE(review): judging by the file name the data are PLOS ONE
academic editors -- confirm against whatever produced the file.

The script is written so that it runs unchanged on Python 2 and Python 3:
every ``print`` call takes exactly one pre-formatted argument.
"""
import gzip
import os
import sys
from datetime import date

try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3

import matplotlib
import numpy as np
from matplotlib import pylab as pl

# x-axis window shared by all per-year plots.
YEAR_AXIS_LIMITS = (2005, 2014)
# Workload class (number of handled submissions) that gets a closer look.
# NOTE(review): 441 is specific to the original data set -- presumably the
# largest class; confirm before reusing on other data.
FOCUS_CLASS = 441
# Editors whose median total time reaches this many days are listed.
SLOW_MEDIAN_DAYS = 300

MEDIAN_TOTAL_LABEL = 'median total time to publication / days'


def _yearly_medians(values_by_year):
    """Return ``(years, medians)`` with years ascending and medians aligned."""
    years = sorted(values_by_year)
    return years, [np.median(values_by_year[year]) for year in years]


def _finish_year_axes(ax, ylabel, ylim=None):
    """Apply the common per-year axis formatting to *ax* and show the figure."""
    ax.ticklabel_format(useOffset=False)
    ax.set_xlim(*YEAR_AXIS_LIMITS)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel('year')
    ax.set_ylabel(ylabel)
    pl.show()


# ---------------------------------------------------------------------------
# Load the data
# ---------------------------------------------------------------------------
home = os.path.expanduser('~')
plos_dat = home+'/private/plos_one/editors_plos_one_dates.dat'
# SECURITY: unpickling can execute arbitrary code stored in the file; only
# load a data file you created yourself.
# NOTE(review): a pickle written by Python 2 may need encoding='latin1' when
# loaded under Python 3 -- confirm with the real file.
with gzip.open(plos_dat, 'rb') as dat_file:
    dat = pickle.load(dat_file)

# ---------------------------------------------------------------------------
# Collect per-editor and per-year durations (all in days)
# ---------------------------------------------------------------------------
durations = []                  # one list of total times per editor
durations_annual = {}           # received year -> total times
review_time_annual = {}         # received year -> received-to-accepted times
publication_time_annual = {}    # accepted year -> accepted-to-published times
editors_annual = {}             # received year -> editor key per submission
received = []
published = []
publications = []               # records, parallel to received / published

for key, pubs in dat.items():
    dur = []
    for pub in pubs:
        recv_date = date(*pub['received_date'])
        acc_date = date(*pub['accepted_date'])
        publ_date = date(*pub['publication_date'])

        total_days = (publ_date - recv_date).days
        dur.append(total_days)
        durations_annual.setdefault(recv_date.year, []).append(total_days)
        review_time_annual.setdefault(recv_date.year, []).append(
            (acc_date - recv_date).days)
        publication_time_annual.setdefault(acc_date.year, []).append(
            (publ_date - acc_date).days)
        editors_annual.setdefault(recv_date.year, []).append(key)

        received.append(recv_date)
        published.append(publ_date)
        publications.append(pub)
    durations.append(dur)

if not received:
    # min()/max() below would fail with an unhelpful ValueError.
    sys.exit('no publications found in %s' % plos_dat)

earliest_submission = min(received)
latest_submission = max(received)
earliest_publication = min(published)
latest_publication = max(published)
no_submissions = len(received)
print('number of submissios %s' % no_submissions)
print('earliest submission %s' % earliest_submission)
print('latest submission %s' % latest_submission)
print('earliest publication %s' % earliest_publication)
print('latest publication %s' % latest_publication)

# Report the ids of the records that hold the extreme dates.
for pub, recv_date, publ_date in zip(publications, received, published):
    if publ_date == earliest_publication:
        print('earliest publication %s' % (pub['id'],))
    if recv_date == earliest_submission:
        print('earliest submission %s' % (pub['id'],))
    if publ_date == latest_publication:
        print('latest publication %s' % (pub['id'],))
    if recv_date == latest_submission:
        print('latest submission %s' % (pub['id'],))

# ---------------------------------------------------------------------------
# Articles per year
# ---------------------------------------------------------------------------
no_annual_publ = {}
no_annual_recv = {}
for day in published:
    no_annual_publ[day.year] = no_annual_publ.get(day.year, 0) + 1
for day in received:
    no_annual_recv[day.year] = no_annual_recv.get(day.year, 0) + 1
print('annual received %s' % (no_annual_recv,))
print('annual published %s' % (no_annual_publ,))

# Every yearly series is built over an explicit sorted list of years so that
# x and y values can never get out of step with each other.
recv_years = sorted(no_annual_recv)
publ_years = sorted(no_annual_publ)

fig, ax = pl.subplots()
ax.plot(recv_years, [no_annual_recv[year] for year in recv_years],
        'o', color='blue')
ax.plot(publ_years, [no_annual_publ[year] for year in publ_years],
        'o', color='red')
_finish_year_axes(ax, 'no. of articles', ylim=(-500, 22000))

# ---------------------------------------------------------------------------
# Median times per year
# ---------------------------------------------------------------------------
fig, ax = pl.subplots()
da_years, da_median = _yearly_medians(durations_annual)
ax.plot(da_years, da_median, 'o', color='blue')
_finish_year_axes(ax, MEDIAN_TOTAL_LABEL)

fig, ax = pl.subplots()
rta_years, rta_median = _yearly_medians(review_time_annual)
pta_years, pta_median = _yearly_medians(publication_time_annual)
ax.plot(pta_years, pta_median, 'o', color='red', markersize=10)
ax.plot(rta_years, rta_median, 'o', color='blue')
_finish_year_axes(ax, 'median time / days', ylim=(20, 120))

# These were bare expressions (notebook cell outputs); print them so the
# values are also visible when the file runs as a script.
print('%s' % (rta_median,))

# ---------------------------------------------------------------------------
# Received articles per active editor and year
# ---------------------------------------------------------------------------
ea_years = sorted(editors_annual)
ea = [len(set(editors_annual[year])) for year in ea_years]
print('%s' % (ea,))

# editors_annual and no_annual_recv are both keyed by the received year, so
# every year in ea_years is present in no_annual_recv.
ed_recv_ratio = [float(no_annual_recv[year]) / float(n_editors)
                 for year, n_editors in zip(ea_years, ea)]
fig, ax = pl.subplots()
ax.plot(ea_years, ed_recv_ratio, 'o', color='red')
_finish_year_axes(ax, 'no. received articles / editors')

print('%s' % (ed_recv_ratio,))

# ---------------------------------------------------------------------------
# Per-editor medians
# ---------------------------------------------------------------------------
print('%s' % (durations[0],))
print('%s' % (durations[2],))

editor_medians = [np.median(ed) for ed in durations]

# density=True replaces the removed normed=True keyword.
n, bins, patches = pl.hist(editor_medians, density=True, bins=20)
pl.xlabel(MEDIAN_TOTAL_LABEL)
pl.ylabel('frequency')
pl.show()

# bins holds one more edge than n has bars; zip pairs each bar height with
# the left edge of its bin.
print('%s' % (sorted(zip(n, bins), key=lambda pair: pair[0], reverse=True),))

pl.scatter(range(len(durations)), editor_medians)
pl.xlabel('editor')
pl.ylabel(MEDIAN_TOTAL_LABEL)
pl.show()

slow_editors = [ed for ed, med in zip(durations, editor_medians)
                if med >= SLOW_MEDIAN_DAYS]
print('%s' % ([len(ed) for ed in slow_editors],))
print('number of editors with median >= 300 and one submission %s'
      % len([ed for ed in slow_editors if len(ed) == 1]))
print('number of editors with median >= 300 and more than one submission %s'
      % len([ed for ed in slow_editors if len(ed) > 1]))
print('%s' % ([ed for ed in slow_editors if len(ed) == 8],))

# ---------------------------------------------------------------------------
# Group editors by workload (number of handled submissions)
# ---------------------------------------------------------------------------
classes = {}
for ed in durations:
    classes.setdefault(len(ed), []).append(ed)
class_sizes = sorted(classes)

print('least number of submissions handled to publication %s'
      % min(class_sizes))
print('greatest number of submissions handled to publication %s'
      % max(class_sizes))
print('%s' % len(classes.get(FOCUS_CLASS, [])))

# Pool all total times of the editors that share a workload class.
bp_data = []
for cl in class_sizes:
    pooled = []
    for ed in classes[cl]:
        pooled.extend(ed)
    bp_data.append(pooled)

# Same boxplot twice: full range first, then zoomed to 0..1200 days.
for y_limits in (None, (0, 1200)):
    fig, ax = pl.subplots()
    ax.boxplot(bp_data, positions=class_sizes)
    ax.set_xlabel('number of submissions handled')
    ax.set_ylabel('total time to publication / days')
    ax.set_xticks(list(range(1, max(class_sizes) + 50, 50)))
    if y_limits is not None:
        ax.set_ylim(*y_limits)
    pl.show()

# ---------------------------------------------------------------------------
# Closer look at the editor in the focus class
# ---------------------------------------------------------------------------
ed_id = None
ed_durations = None
for ed_i, ed in enumerate(durations):
    if len(ed) == FOCUS_CLASS:
        # If several editors match, the last one wins.
        ed_id = ed_i
        ed_durations = ed
print('%s' % ed_id)

if ed_id is not None:
    # Use the index that was actually found instead of a hard-coded one; the
    # position of an editor in `durations` depends on dict iteration order.
    pl.scatter(range(len(ed_durations)), ed_durations)
    pl.xlabel('submission number handled by editor %d' % ed_id)
    pl.ylabel('total time to publication / days')
    pl.show()
    print('median total time for editor %d: %s'
          % (ed_id, np.median(ed_durations)))

pl.scatter(range(len(durations)), sorted(editor_medians))
pl.xlabel('editor')
pl.ylabel(MEDIAN_TOTAL_LABEL)
pl.xlim(-100, len(durations) + 100)
pl.ylim(0, 600)
pl.show()

# Sorted per-editor medians for each workload class that has a moderate
# number of editors (more than 50, fewer than 200).
for cl in class_sizes:
    same_load = classes[cl]
    if 50 < len(same_load) < 200:
        pl.scatter(range(len(same_load)),
                   sorted([np.median(ed) for ed in same_load]))
        print('class %s' % cl)
        pl.xlabel('editor')
        pl.ylabel(MEDIAN_TOTAL_LABEL)
        pl.show()