import requests
import pandas as pd
import numpy as np
import scattertext as st
import spacy
import time
from IPython.display import IFrame
from IPython.core.display import display, HTML
from bokeh.palettes import PuBu
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, ranges, LabelSet
from bokeh.plotting import figure
output_notebook()
display(HTML("<style>.container { width:98% !important; }</style>"))
%matplotlib inline
assert [int(x) for x in st.__version__.split('.')] >=[0,0,2,20]
url = 'https://openreview.net/notes?invitation=ICLR.cc%2F2018%2FConference%2F-%2FBlind_Submission&offset=0&limit=1000'
df = pd.DataFrame(requests.get(url).json()['notes'])
forum_content = []
for i, forum_id in list(enumerate(df.forum)):
notes_url = 'https://openreview.net/notes?forum={}&trash=true'.format(forum_id)
try:
forum_content.append(requests.get(notes_url).json())
except:
print('err', i, forum_id)
forum_content = {}
time.sleep(.3)
df['forumContent'] = pd.Series(forum_content)
df.to_csv('iclr2018_raw.csv.bz2', index=False, compression='bz2')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2b18eea99cb5> in <module>() 1 forum_content = [] ----> 2 for i, forum_id in list(enumerate(df.forum)): 3 notes_url = 'https://openreview.net/notes?forum={}&trash=true'.format(forum_id) 4 try: 5 forum_content.append(requests.get(notes_url).json()) NameError: name 'df' is not defined
#
# Code here is to read locally:
read_local = True
if read_local:
df = pd.read_csv('iclr2018_raw.csv.bz2')
df['forumContent'] = df.forumContent.apply(eval) # totally unsafe
df['content'] = df.content.apply(eval)
df['decision_raw'] = df.forumContent.apply(lambda x:[n['content']['decision']
for n in x['notes']
if 'decision' in n['content']][0])
df['decision_raw'].value_counts()
Reject 504 Accept (Poster) 313 Invite to Workshop Track 90 Accept (Oral) 23 Name: decision_raw, dtype: int64
len(df)
930
df['title'] = df.content.apply(lambda x: x['title'])
df['authors'] = df.content.apply(lambda x: x['authors'])
only_reviews_df = pd.concat(df.forumContent.apply(lambda c: pd.DataFrame([
{'review': n['content']['review'],
'rating': n['content']['rating'],
'confidence': n['content']['confidence'],
'forum': n['forum']}
for n in c['notes']
if 'content' in n and 'review' in n['content']
])).tolist())
reviews_df = pd.merge(df[['title', 'authors', 'decision_raw', 'forum']], only_reviews_df, on='forum')
#reviews_df.groupby('decision_raw')['rating'].value_counts()
reviews_df['decision'] = (reviews_df['decision_raw']
.apply(lambda x: 'Reject' if x == 'Reject'
else ('Accept' if x.startswith('Accept')
else 'Workshop')))
reviews_df['rating_bin'] = (reviews_df['rating']
.apply(lambda x: (lambda s: 'Negative' if s < 5
else ('Positive' if s > 6 else 'Neutral'))
(int(x.split(':')[0].strip()))))
reviews_df['category'] = reviews_df['decision'] + ', ' + reviews_df['rating_bin']
decisions = reviews_df[['forum','decision_raw']].drop_duplicates()['decision_raw'].value_counts()
source = ColumnDataSource(dict(x=list(decisions.index),y=decisions.values))
#source = ColumnDataSource({'x': decisions.index, 'y': decisions.values}
plot = figure(plot_width=600, plot_height=300, tools="save",
x_axis_label = "Decision",
y_axis_label = "Paper Count",
title="",
x_minor_ticks=2,
x_range = source.data["x"],
y_range= ranges.Range1d(start=0,end=600))
labels = LabelSet(x='x', y='y', text='y', level='glyph',
x_offset=-13.5, y_offset=0, source=source, render_mode='canvas')
plot.vbar(source=source,x='x',top='y',bottom=0,width=0.3,color=PuBu[7][2])
plot.add_layout(labels)
show(plot)
ratings = reviews_df['rating'].value_counts()
ratings.index = [int(c.split(':')[0]) for c in ratings.index]
ratings = ratings.sort_index()
source = ColumnDataSource(dict(x=[str(x) for x in ratings.index],y=ratings.values))
plot = figure(plot_width=600, plot_height=300, tools="save",
x_axis_label = "Rating",
y_axis_label = "Review Count",
title="",
x_minor_ticks=2,
x_range = source.data["x"],
y_range= ranges.Range1d(start=0,end=ratings.max() + 100))
labels = LabelSet(x='x', y='y', text='y', level='glyph',
x_offset=-13.5, y_offset=0, source=source, render_mode='canvas')
plot.vbar(source=source,x='x',top='y',bottom=0,width=0.3,color=PuBu[7][2])
plot.add_layout(labels)
show(plot)
/Users/kesslej/anaconda3/lib/python3.5/site-packages/bokeh/core/json_encoder.py:80: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. elif np.issubdtype(type(obj), np.float):
reviews_df['metadata'] = (
reviews_df['title'] + '<br/>Score: ' + reviews_df['rating'].apply(lambda x: x.split(':')[0]) + '/10'
+ '<br/>Confidence: ' + reviews_df['confidence'].apply(lambda x: x.split(':')[0]) + '/5'
+ '<br/>Ultimate decision: ' + reviews_df['decision'].apply(lambda x: x.split(':')[0]) + '/10'
)
reviews_df.to_csv('iclr2018_reviews.csv.bz2', index=False, compression='bz2')
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
nlp = spacy.load('en')
reviews_df['parse'] = reviews_df['review'].apply(nlp)
corpus = (st.CorpusFromParsedDocuments(reviews_df, category_col = 'rating_bin', parsed_col = 'parse')
.build()
.remove_categories(['Neutral']))
html = st.produce_scattertext_explorer(corpus,
category='Positive',
not_categories=['Negative'],
transform = st.Scalers.percentile_dense,
term_scorer = st.RankDifference(),
metadata = corpus.get_df()['metadata'])
file_name = '../jasonkessler.github.io/iclr2018reviews/pos_neg_dense.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
6131293
four_square_corpus = (st.CorpusFromParsedDocuments(reviews_df, category_col = 'category', parsed_col = 'parse')
.build()
.get_unigram_corpus()
.compact(st.ClassPercentageCompactor(term_count=1)))
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
8267592
four_square_axes = st.FourSquareAxes(four_square_corpus,
left_categories=['Accept, Positive'],
right_categories=['Accept, Negative'],
top_categories=['Reject, Positive'],
bottom_categories=['Reject, Negative'],
labels = {'a': 'Positive',
'b': 'Review that was Contrary to Accpetance Decision',
'not_a': 'Negative',
'not_b': 'Review that in Line With Acceptance Decision'},
term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_axes_explorer(
four_square_axes=four_square_axes,
x_label="Accepts: Pos-Neg",
y_label='Rejects: Neg-Pos',
use_full_doc=True,
metadata=four_square_corpus.get_df()['metadata'],
color_func='(function(d) {return d3.rgb(230, 220, 230)})',
censor_points = False,
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes_display.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
8205863
four_square_axes = st.FourSquareAxes(four_square_corpus,
left_categories=['Accept, Positive'],
right_categories=['Accept, Negative'],
top_categories=['Reject, Positive'],
bottom_categories=['Reject, Negative'],
labels = {'a': 'Positive',
'b': 'Review that was Contrary to Accpetance Decision',
'not_a': 'Negative',
'not_b': 'Review that in Line With Acceptance Decision'},
term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_axes_explorer(
four_square_axes=four_square_axes,
x_label="Accepts: Pos-Neg",
y_label='Rejects: Neg-Pos',
use_full_doc=True,
metadata=four_square_corpus.get_df()['metadata'],
color_func='(function(d) {return d3.rgb(230, 220, 230)})',
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes_interactive.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
8205862
four_square= st.FourSquare(four_square_corpus,
category_a_list=['Accept, Positive'],
category_b_list=['Accept, Negative'],
not_category_b_list=['Reject, Positive'],
not_category_a_list=['Reject, Negative'],
labels = {'a_and_b': 'Accept',
'not_a_and_not_b': 'Reject',
'a_and_not_b': 'Positive',
'b_and_not_a': 'Negative'},
term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_explorer(
four_square=four_square,
y_label='Accept-Reject',
x_label='Positive-Negative',
use_full_doc=True,
metadata = four_square_corpus.get_df()['metadata'],
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
8206300
#corpus = corpus.remove_infrequent_words(5)
t0 = time.time()
compact_corpus = st.CompactTerms(corpus, st.OncePerDocFrequencyRanker, 5).compact()
print(time.time() - t0)
29.19629192352295
fine_grain_corpus = (st.CorpusFromParsedDocuments(reviews_df, category_col='category', parsed_col='parse').build())
fine_grain_corpus.get_categories()
['Reject, Negative', 'Reject, Neutral', 'Accept, Negative', 'Accept, Positive', 'Reject, Positive', 'Workshop, Neutral', 'Accept, Neutral', 'Workshop, Negative', 'Workshop, Positive']
fine_grain_corpus_compact = st.CompactTerms(fine_grain_corpus, st.OncePerDocFrequencyRanker, 5).compact()
len(fine_grain_corpus_compact.get_terms()), len(fine_grain_corpus.get_terms())
(31640, 307829)
tdf = st.OncePerDocFrequencyRanker(fine_grain_corpus).get_ranks()
ap_vs_rp = st.RankDifference().get_scores(tdf['Accept, Positive freq'], tdf['Reject, Positive freq'])
print(terms.iloc[:10].index)
print(terms.iloc[-10:].index)
Index(['case for', 'evaluating', 'closer', 'closer to', 'machines', 'applications', 'e.g. the', 'node', 'doing', 'are of'], dtype='object', name='term') Index(['between', 'way', 'only', 'first', '/', 'method', 'given', 'about', 'to see', 'see'], dtype='object', name='term')
an_vs_rn = st.RankDifference().get_scores(tdf['Reject, Positive freq'], tdf['Accept, Positive freq'])
print(terms.iloc[:10].index)
print(terms.iloc[-10:].index)
Index(['here the', 'observations', 'authors show', 'valuable', 'find that', 'it ’s', 'from table', 'method which', 'put', 'the process'], dtype='object', name='term') Index(['model', 'no', 'for the', 'new', 'neural', 'are not', 'dataset', 'these', 'about', 'network'], dtype='object', name='term')
four_square = st.FourSquare(fine_grain_corpus_compact,
['Accept, Positive'],
['Reject, Positive'],
['Accept, Negative'],
['Reject, Negative'],
term_ranker=st.OncePerDocFrequencyRanker,
scorer = st.RankDifference())
html = st.produce_four_square_explorer(four_square=four_square,
x_label='Pos-Neg',
y_label='Accept-Reject',
num_terms_semiotic_square=10,
minimum_term_frequency=10,
pmi_threshold_coefficient=10,
term_ranker=st.OncePerDocFrequencyRanker,
metadata=(fine_grain_corpus_compact._df['category'] + ': '
+ fine_grain_corpus_compact._df.rating + ', '
+ fine_grain_corpus_compact._df['title']))
file_name = 'four_square.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
axes = four_square.get_axes()
axes.sort_values(by='x')
x | y | counts | |
---|---|---|---|
term | |||
not well | -0.060523 | -0.060523 | 23 |
observations | -0.056245 | -0.056245 | 52 |
case for | -0.054141 | -0.054141 | 14 |
it ’s | -0.053300 | -0.053300 | 40 |
doing | -0.046216 | -0.046216 | 53 |
networks as | -0.043832 | -0.043832 | 15 |
here the | -0.043551 | -0.043551 | 19 |
be the | -0.042359 | -0.042359 | 62 |
from table | -0.040466 | -0.040466 | 11 |
natural language | -0.040466 | -0.040466 | 13 |
observed | -0.039624 | -0.039624 | 39 |
generated data | -0.037240 | -0.037240 | 7 |
unable to | -0.037170 | -0.037170 | 8 |
ensemble | -0.037170 | -0.037170 | 12 |
unclear whether | -0.037170 | -0.037170 | 8 |
are pretty | -0.037170 | -0.037170 | 8 |
for improving | -0.037029 | -0.037029 | 14 |
vanilla | -0.036959 | -0.036959 | 19 |
are just | -0.036889 | -0.036889 | 18 |
evaluating | -0.036889 | -0.036889 | 24 |
besides | -0.036819 | -0.036819 | 23 |
benefit of | -0.036538 | -0.036538 | 31 |
properties of | -0.036398 | -0.036398 | 37 |
to add | -0.036328 | -0.036328 | 42 |
properties | -0.035697 | -0.035697 | 63 |
are of | -0.033943 | -0.033943 | 6 |
machines | -0.033803 | -0.033803 | 12 |
observations and | -0.033803 | -0.033803 | 8 |
closer to | -0.033803 | -0.033803 | 12 |
even the | -0.033663 | -0.033663 | 10 |
... | ... | ... | ... |
tasks | 0.201908 | 0.201908 | 152 |
clearly | 0.202819 | 0.202819 | 192 |
's | 0.203100 | 0.203100 | 194 |
their | 0.204573 | 0.204573 | 282 |
interesting | 0.205695 | 0.205695 | 371 |
easy to | 0.211235 | 0.211235 | 118 |
bit | 0.211866 | 0.211866 | 143 |
4 | 0.214181 | 0.214181 | 236 |
network | 0.214391 | 0.214391 | 265 |
loss | 0.215092 | 0.215092 | 137 |
i am | 0.219020 | 0.219020 | 150 |
am | 0.219160 | 0.219160 | 155 |
you | 0.223648 | 0.223648 | 203 |
further | 0.224770 | 0.224770 | 117 |
particular | 0.225682 | 0.225682 | 141 |
easy | 0.231994 | 0.231994 | 130 |
makes | 0.232274 | 0.232274 | 140 |
previous | 0.232555 | 0.232555 | 153 |
given | 0.237674 | 0.237674 | 216 |
is well | 0.239287 | 0.239287 | 153 |
both | 0.240830 | 0.240830 | 215 |
way | 0.243986 | 0.243986 | 197 |
new | 0.244828 | 0.244828 | 247 |
novel | 0.247002 | 0.247002 | 188 |
first | 0.247984 | 0.247984 | 226 |
well written | 0.253945 | 0.253945 | 200 |
/ | 0.258363 | 0.258363 | 232 |
written | 0.279473 | 0.279473 | 290 |
about | 0.319798 | 0.319798 | 249 |
to see | 0.328775 | 0.328775 | 185 |
31640 rows × 3 columns
tdf = corpus.get_term_freq_df()
tdf['sfs'] = ScaledFScorePresets(beta = 1).get_scores(tdf['Reject freq'], tdf['Accept freq'])
tdf['sfs_p'] = ScaledFScorePresets(beta = 1, priors=priors).get_scores(tdf['Reject freq'], tdf['Accept freq'])
tdf['loridp'] = st.LogOddsRatioInformativeDirichletPrior(priors, reviews_df.parse.apply(len).mean(), 'word').get_scores(tdf['Reject freq'], tdf['Accept freq'])
tdf['rankdiff'] = st.RankDifference().get_scores(tdf['Reject freq'], tdf['Accept freq'])
pd.DataFrame(
{s:tdf.sort_values(by=s, ascending=False).iloc[::].index
for s in ['sfs', 'sfs_p', 'loridp', 'rankdiff']
}
).iloc[:10]
loridp | rankdiff | sfs | sfs_p | |
---|---|---|---|---|
0 | _ | _ _ | _ _ | _ _ |
1 | $ $ | _ | _ | _ |
2 | dialog | novelty | time series | time series |
3 | medical | i do | autoencoder | autoencoder |
4 | word2vec | layers | series | reconstruction |
5 | mutual | graph | connections | series |
6 | mutual information | limited | reconstruction | $ $ |
7 | _ _ | claim | novelty | connections |
8 | mi | class | $ $ | classes |
9 | auto encoders | is no | classes | novelty |
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(spacy.load('en', parser=False))
# Create Corpus based on accept/reject/workshop decision
# A two-category corpus to use for plotting, with unigrams which only occur in bigrams removed.
# Terms used in <5 documents are removed as well.
full_corpus = (
st.CorpusFromParsedDocuments(reviews_df, category_col='decision', parsed_col='parse')
.build().remove_categories(['Workshop'])
.compact(st.CompactTerms(st.TermCompactor, minimum_term_count=6))
)
# Use counts of unigrams and bigrams from the Workshop corpus as the Dirichlet prior
priors = (st.PriorFactory(full_corpus, term_ranker=st.OncePerDocFrequencyRanker)
.use_categories(['Workshop'].align_to_target(corpus).get_priors()))
term_scorer = LogOddsRatioInformativeDirichletPrior(
priors, reviews_df.parse.apply(len).mean(), 'word') # use the original approach to scaling prior
html = st.produce_frequency_explorer(corpus,
category='Accept', not_categories=['Reject'],
term_ranker = st.OncePerDocFrequencyRanker,
term_scorer = term_scorer,
grey_threshold = 1.96,
metadata = corpus.get_df()['metadata'])
file_name = 'accept_reject_loridp.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
6131293
html = st.produce_frequency_explorer(compact_corpus,
category='Accept',
not_categories=['Reject'],
term_ranker = st.OncePerDocFrequencyRanker,
term_scorer = st.RankDifference(),
grey_threshold = 0,
metadata = (corpus._df['title']
+ '<br/>Score: ' + corpus._df['rating'].apply(lambda x: x.split(':')[0]) + '/10'
+ '<br/>Confidence: ' + corpus._df['confidence'].apply(lambda x: x.split(':')[0]) + '/5'))
file_name = 'accept_reject_rankdiff.html'
open(file_name, 'wb').write(html.encode('utf-8'))
8424305
four_square_corpus_phrases = (st.CorpusFromParsedDocuments(reviews_df, category_col = 'category', parsed_col = 'parse',
feats_from_spacy_doc=st.PhraseMachinePhrases())
.build().compact(st.ClassPercentageCompactor(term_count=1)))
four_square_axes = st.FourSquareAxes(four_square_corpus_phrases,
left_categories=['Accept, Positive'],
right_categories=['Accept, Negative'],
top_categories=['Reject, Positive'],
bottom_categories=['Reject, Negative'],
labels = {'a': 'Positive',
'b': 'Review that was Contrary to Accpetance Decision',
'not_a': 'Negative',
'not_b': 'Review that in Line With Acceptance Decision'},
term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_axes_explorer(
four_square_axes=four_square_axes,
x_label="Accepts: Pos-Neg",
y_label='Rejects: Neg-Pos',
use_full_doc=True,
pmi_threshold_coefficient=0,
censor_points=False,
metadata=four_square_corpus_phrases.get_df()['metadata'],
color_func='(function(d) {return d3.rgb(230, 220, 230)})',
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes_phrases.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)
7409359