%load_ext autoreload
%autoreload 2
%store -r the_page
if 'the_page' not in locals():
import pickle
print("Loading default data...")
the_page = pickle.load(open("data/the_page.p",'rb'))
from IPython.display import display, Markdown as md
display(md("---"))
display(md(f"# A. Actions: insertions and deletions"))
display(md(f" Provided by the [WikiWho API](https://www.wikiwho.net/en/api/v1.0.0-beta/)"))
display(md(f"***Page: {the_page['title']}***"))
display(md(f"The [WikiWho API](https://www.wikiwho.net/en/api/v1.0.0-beta/) tracks every token on a "
"Wikipedia page. The underlying WikiWho algorithm is able to distinguis every token on the "
'document even when the token appears several times, e.g. the token "the" appears multiple times'
"and yet WikiWho tracks them singularly (95% accuracy)."
))
display(md("Give a uniquely tracked token, there are two possible actions that can perfomed: i.e. **insertions** "
'and **deletions** (a character change in a word, e.g. "color" -> "colour", is modeled as deletion of '
'"color" and the insertion of "colour", two separate tokens). An **insertion** is also considered '
"a **re-insertion** if the insertion has occured before; the only insertion of a token that is not "
"a re-insertion is the first one. Similarly, a **deletion** is also considered a **re-deletion** if "
"the deletion has occured before."))
display(md("Formally, the token history can represented by a time-ordered sequence of actions "
"*(a<sub>0</sub>, ..., a<sub>n</sub>)*; note that *a<sub>0+2i</sub>* is always an insertion and"
"*a<sub>1+2i</sub>* is always a deletion for i ∈ ℕ."))
display(md("---\n***IMPORTANT:*** For articles with long revisions history, the process can take a long time. "
"Please give the some time to load (see cog wheel symbol right of 'edit app') "
"before interacting with the controls too often!"))
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload no stored variable the_page
Provided by the WikiWho API
*Page: The Camp of the Saints*
The WikiWho API tracks every token on a Wikipedia page. The underlying WikiWho algorithm is able to distinguis every token on the document even when the token appears several times, e.g. the token "the" appears multiple timesand yet WikiWho tracks them singularly (95% accuracy).
Give a uniquely tracked token, there are two possible actions that can perfomed: i.e. insertions and deletions (a character change in a word, e.g. "color" -> "colour", is modeled as deletion of "color" and the insertion of "colour", two separate tokens). An insertion is also considered a re-insertion if the insertion has occured before; the only insertion of a token that is not a re-insertion is the first one. Similarly, a deletion is also considered a re-deletion if the deletion has occured before.
Formally, the token history can represented by a time-ordered sequence of actions (a0, ..., an); note that a0+2i is always an insertion anda1+2i is always a deletion for i ∈ ℕ.
*IMPORTANT:* For articles with long revisions history, the process can take a long time. Please give the some time to load (see cog wheel symbol right of 'edit app') before interacting with the controls too often!
from wikiwho_wrapper import WikiWho
import pandas as pd
import qgrid

# Cap the grid at 5 visible rows so the larger DataFrames we render don't take up too much space
qgrid.set_grid_option('maxVisibleRows', 5)

wikiwho = WikiWho(lng='en')
# Key access, consistent with the_page['title'] / the_page['page_id'] used elsewhere in this notebook
agg_actions = wikiwho.dv.edit_persistence(the_page['page_id'])

# define total columns
total_columns = ['total', 'total_surv_48h', 'total_persistent', 'total_stopword_count']

# add columns with the total actions: element-wise sum of the adds*, dels* and reins* column slices
# (assumes each slice spans the same four columns, in the order of total_columns - TODO confirm)
agg_actions = agg_actions.join(pd.DataFrame(
    agg_actions.loc[:, 'adds':'adds_stopword_count'].values
    + agg_actions.loc[:, 'dels':'dels_stopword_count'].values
    + agg_actions.loc[:, 'reins':'reins_stopword_count'].values,
    index=agg_actions.index,
    columns=total_columns
))

display(md("## A.1 Total actions per month and editor"))
display(md(f"***Page: {the_page['title']}***"))
display(md("""The following table shows the total number of actions (insertions + deletions) per month
(`year_month` column), and editor (`editor_id` and `editor` columns)."""))
display(md("""**Columns description:**
- **total**: total number of actions (insertions, and deletions)
- **total_surv_48h**: total number of actions that survived at least 48 hours
- **total_persistent**: total number of actions that survived until, at least, the end of the month
- **total_stopword_count**: total number of actions that were performed in stop words"""))

from IPython.display import clear_output
from ipywidgets import Output

# the output widget is used to update the qgrid
out = Output()
display(out)

# Show the table without editor names first; it is replaced below once the names are merged in
with out:
    print("Downloading editor usernames (i.e. *editor* column)...")
    display(qgrid.show_grid(agg_actions[['year_month', 'editor_id'] + total_columns]))

# Grab user names from wikipedia and merge them to the aggregate actions dataframe
from external.wikipedia import WikipediaDV, WikipediaAPI
wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))
editors = wikipedia_dv.get_editors(agg_actions['editor_id'].unique()).rename(columns={
    'userid': 'editor_id'})

# Merge the names of the editors to the aggregate actions dataframe
# NOTE(review): this is pandas' default inner merge, so rows whose editor_id is absent from
# `editors` are dropped, and the fillna("Unregistered") below only applies to rows that
# `editors` returns with a missing name. Confirm against get_editors whether how='left' is intended.
agg_actions = agg_actions.merge(editors[['editor_id', 'name']], on='editor_id')
agg_actions.insert(3, 'editor', agg_actions['name'])
agg_actions = agg_actions.drop(columns=['name'])
agg_actions['editor'] = agg_actions['editor'].fillna("Unregistered")

with out:
    clear_output()
    display(qgrid.show_grid(agg_actions[['year_month', 'editor_id', 'editor'] + total_columns]))
*Page: The Camp of the Saints*
The following table shows the total number of actions (insertions + deletions) per month
(year_month
column), and editor (editor_id
and editor
columns).
Columns description:
Output()
display(md("""## A.2. Visualization of actions per month"""))
display(md(f"***Page: {the_page['title']}***"))
display(md("""In the following graph you can select the *date range* and *granularity* (yearly, monthly)
of the timeline (X-axis), and plot any of the following counts in the black, red, blue and green lines:
- **adds**: number of first-time insertions
- **adds_surv_48h**: number of insertions for the first time that survived at least 48 hours
- **adds_persistent**: number of insertions for the first time that survived until, at least, the end of the month
- **adds_stopword_count**: number of insertions that were stop words
- **dels**: number of deletions
- **dels_surv_48h**: number of deletions that were not reinserted in the next 48 hours
- **dels_persistent**: number of deletions that were not reinserted until, at least, the end of the month
- **dels_stopword_count**: number of deletions that were stop words
- **reins**: number of reinsertions
- **reins_surv_48h**: number of reinsertions that survived at least 48 hours
- **reins_persistent**: number of reinsertions that survived until the end of the month
- **reins_stopword_count**: number of reinsertions that were stop words
"""))

# Convert to datetime
agg_actions['year_month'] = pd.to_datetime(agg_actions['year_month'])

# NOTE(review): a groupby(['year_month', 'page_id']).sum() over agg_actions used to be computed
# here, but its result was never assigned or used, so the dead statement was removed. The
# per-editor rows are kept because later cells still group agg_actions by 'editor_id'.

# Listener
from visualization.actions_listener import ActionsListener
listener = ActionsListener(agg_actions)
# assumes column positions 4..15 hold the twelve adds*/dels*/reins* counts listed above - TODO confirm
action_types = (agg_actions.columns[4:16]).values.tolist()

# Visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact, fixed
from ipywidgets.widgets import Dropdown

# The date slider is built from agg_actions, the same frame the listener plots
# (the previous `actions` name is not defined anywhere in this notebook).
interact(listener.listen,
         _range=get_date_slider_from_datetime(agg_actions['year_month']),
         editor=fixed('All'),
         granularity=Dropdown(options=['Yearly', 'Monthly'], value='Yearly'),
         black=Dropdown(options=action_types, value='adds'),
         red=Dropdown(options=['None'] + action_types, value='dels'),
         green=Dropdown(options=['None'] + action_types, value='None'),
         blue=Dropdown(options=['None'] + action_types, value='None'))
*Page: The Camp of the Saints*
In the following graph you can select the date range and granularity (yearly, montly) of the timeline (X-axis), and plot any of the follow counts in the black, red, blue and green lines:
interactive(children=(SelectionRangeSlider(continuous_update=False, description='Date Range', index=(0, 111), …
<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>(*args, **kwargs)>
from IPython.display import display, Markdown as md

# Section A.3 header and legend; the bullet labels mirror the metric names offered
# in the dropdowns of the following visualization cell.
display(md("---"))
display(md('## A.3 Page metrics'))
display(md(f"***Page: {the_page['title']}***"))
display(md("""The term **conflict** is in detail explained in the section B of this notebook.
The total conflict of a page is the sum of all the conflict scores of all actions with
conflict (or conflict actions). This can be normalized if the sum is divided by the number of
actions that can potentially enter into conflict (elegible actions, i.e actions that have occurred at
least twice).
In the following graph you can select the *date range* and *granularity* (yearly, monthly)
of the timeline (X-axis), and plot any of the following counts in the black and red lines:
- **Total**: total number of actions (insertions, and deletions)
- **Total_surv_48h**: total number of actions that survived at least 48 hours
- **Total_persistent**: total number of actions that survived until, at least, the end of the month
- **Total_stopword_count**: total number of actions that were performed in stop words
- **Total Elegible Actions**: the total number of elegible actions
- **Number of Conflicts**: the total number of conflicts
- **Number of Revisions**: the total number of revisions
- **Conflict Score**: the sum of conflict scores of all actions divided by the number of elegible actions
- **Absolute Conflict Score**: the sum of conflict scores of all actions (without division)
- **Conflict Ratio**: the count of all conflicts divided by the number of elegible actions
"""))
*Page: The Camp of the Saints*
The term conflict is in detail explained in the section B of this notebook. The total conflict of a page is the sum of all the conflict scores of all actions with conflict (or conflict actions). This can be normalized if the sum is divided by the number of actions that can potentially enter into conflict (elegible actions, i.e actions that have occurred at least twice).
In the following graph you can select the date range and granularity (yearly, montly) of the timeline (X-axis), and plot any of the following counts in the black and red lines:
# Visualization
from visualization.conflicts_listener import ConflictsListener
listener = ConflictsListener(agg_actions)

metrics = ['Total', 'Total_surv_48h', 'Total_persistent', 'Total_stopword_count',
           'Total Elegible Actions', 'Number of Conflicts', 'Number of Revisions',
           'Conflict Score', 'Absolute Conflict Score', 'Conflict Ratio']

# Page conflict score: summed conflict divided by summed elegible actions.
# With no elegible actions the score is reported as 0 instead of dividing by zero.
total_elegibles = agg_actions.elegibles.sum()
conflict_score = agg_actions.conflict.sum() / total_elegibles if total_elegibles else 0.0
display(md(f'**Page conflict score: {conflict_score}**'))

# Visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact
from ipywidgets.widgets import Dropdown

# Only offer the interactive plot when there is some conflict to show. The check and the
# date slider rely on values available at this point of the notebook: `calculator` is only
# created in section B, and `elegible_actions` is not defined anywhere in this notebook.
# NOTE(review): the slider now spans agg_actions['year_month'], the frame the listener was
# built from; confirm the listener handles the 'Daily' granularity for this data.
if conflict_score != 0:
    interact(listener.listen,
             _range=get_date_slider_from_datetime(agg_actions['year_month']),
             granularity=Dropdown(options=['Yearly', 'Monthly', 'Daily'], value='Monthly'),
             black=Dropdown(options=metrics, value='Conflict Score'),
             red=Dropdown(options=['None'] + metrics, value='None'))
Page conflict score: 0.8526734147486141
interactive(children=(SelectionRangeSlider(continuous_update=False, description='Date Range', index=(0, 62), l…
from IPython.display import display, Markdown as md

# Markdown fragments for the A.4 header, rendered one after another in this order.
_a4_intro_parts = [
    "---",
    '## A.4 Editor Conflict Score',
    f"***Page: {the_page['title']}***",
    """Similar to the previous idea, it is possible to calculate the editor conflict score. The
table below presents the conflict score and other related metrics per editor (*editor_id* and *editor*
column):
- **conflicts**: the total number of conflicts
- **elegibles**: the total number of elegible actions performed by the editor
- **conflict**: the sum of conflict scores of all actions divided by the number of elegible actions
""",
]

for _a4_part in _a4_intro_parts:
    display(md(_a4_part))
*Page: The Camp of the Saints*
Similar to the previous idea, it is possible to calculate the editor conflict score. The table below presents the conflict score and other related metrics per editor (editor_id and editor column):
# Sum the conflict metrics per editor, then normalise the summed conflict by the
# editor's number of elegible actions.
editors_conflicts = agg_actions.groupby('editor_id').agg(
    {'conflicts': 'sum', 'elegibles': 'sum', 'conflict': 'sum'}).reset_index()
editors_conflicts['conflict'] = (editors_conflicts['conflict'] / editors_conflicts['elegibles'])

if len(editors_conflicts) > 0:
    # After reset_index() 'editor_id' is a regular column on both sides, so the merge is
    # keyed with `on` alone; pandas rejects `on` combined with `right_index=True`.
    # dropna() removes editors whose score is undefined (0 / 0 elegible actions).
    editors_conflicts = editors[['editor_id', 'name']].merge(
        editors_conflicts.dropna(), on='editor_id').set_index('editor_id')
    qg_obj = qgrid.show_grid(editors_conflicts.dropna())
    display(qg_obj)
else:
    display(md(f'**There is no Conflict Scores**'))
    # None marks "no conflict scores" for the cells that read editors_conflicts later
    editors_conflicts = None
QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…
from IPython.display import display, Markdown as md, HTML as html

# Section B header: informal definition of conflict (markdown) followed by the
# formal definition (HTML, for the ordered list and subscripts).
display(md("---"))
display(md("# B. Measuring conflict"))
display(md(f"***Page: {the_page['title']}***"))
display(md("""
A measurement of ***conflict*** is provided in [Flöck et al](https://arxiv.org/abs/1703.08244). They consider
that an action (insertion or deletion) on a token `x` presents ***conflict*** if (1) it is not the first
time the action happens, i.e. it is a re-insertion, or re-deletion of `x`, and if (2) the re-insertion or
re-deletion is not a contiguous undo performed by the same editor, e.g. editor `a` inserted `x`, and
then deleted it. For the last rule (2), only contiguous actions for `x` are considered, meaning that
no other editors performed actions on `x` between the do and undo of editor `a`. """))
display(html("""Formally, an action <i>a<sub>i</sub></i> (insertion or deletion)
on a token x written by editor <i>A</i> is in conflict in revision <i>r</i> if<br /><ol>
<li><b>there exists an action <i>a<sub>i-2</sub></i> </b>, note that <i>a<sub>i</sub></i> is equal to
<i>a<sub>i-2</sub></i>, i.e. it is a reinsertion or redeletion, because insertions and deletions
must alternate, and if</li>
<li><b> the previous action <i>a<sub>i-1</sub></i> was not performed by editor <i>A</i></b>, i.e.
it is not an undo performed by the same editor</li>
</ol>
The next section will present the Conflict Score of each action in the revision.
"""))
*Page: The Camp of the Saints*
A measurement of *conflict* is provided in Flöck et al. They consider
that an action (insertion or deletion) on a token x
presents *conflict* if (1) it is not the first
time the action happens, i.e. it is a re-insertion, or re-deletion of x
, and if (2) the re-insertion or
re-deletion is not a contiguous undo performed by the same editor, e.g. editor a
inserted x
, and
then deleted it. For the last rule (2), only contiguous actions for x
are considered, meaning that
no other editors performed actions on x
between the do and undo of editor a
.
from IPython.display import clear_output, display, Markdown as md
from wikiwho_wrapper import WikiWho

# WikiWho API client (lng='en')
wikiwho = WikiWho(lng='en')

# Fetch the page content and then its revision ids from the WikiWho API,
# announcing each download before it starts.
display(md("Downloading all_content from the WikiWhoApi..."))
all_content = wikiwho.dv.all_content(the_page['page_id'])

display(md("Downloading revisions from the WikiWhoApi..."))
revisions = wikiwho.dv.rev_ids_of_article(the_page['page_id'])

# Remove the progress messages once both downloads have finished
clear_output()
from metrics.conflict import ConflictManager
from wikiwho_wrapper import WikiWho
from IPython.display import clear_output
from IPython.display import HTML
from utils.notebooks import get_next_notebook, get_previous_notebook

# call the calculator
calculator = ConflictManager(all_content, revisions)
calculator.calculate()
clear_output()

# display the tokens, the difference in seconds and its corresponding conflict score
# (work on a copy so the extra column does not leak into calculator.conflicts)
conflicts = calculator.conflicts.copy()
conflicts['time_diff_secs'] = conflicts['time_diff'].dt.total_seconds()

display(md('## B.1 Conflict score of each singular action'))
display(md(f"***Page: {the_page['title']}***"))
display(md("""In the following table, all the actions that are in conflict are presented, and a conflict score
is presented per action. The importance of the conflict can be measured by considering the seconds t that
have passed since the last action on the same token has occurred (`time_diff_secs` column). A score to
measure conflict is calculated based on t with the following formula: 1 / log<sub>3600</sub>(t+2).
Thus, *undo* actions are weighted higher than the original time in seconds when the *t* is less than an hour.
For details, please refer to [Flöck et al, 2017](https://arxiv.org/abs/1703.08244).
**Columns description:**
- **token**: the string of the token that is being tracked
- **token_id**: the id of the token that is being tracked
- **rev_id**: the revision id in which the action (insertion or deletion) happened
- **editor_id**: the id of the editor that inserted the token (if starts with **0|**, it means that
the editor is not registered, and the ip is displayed instead)
- **time_diff_secs**: seconds that have passed since the last action on the same token has occurred
- **conflict**: a score to measure conflict that is calculated based on the `time_diff_secs`
with the following formula: *1 / log<sub>3600</sub>(time_diff_secs + 2)*. For details, please refer to
[Flöck et al, 2017](https://arxiv.org/abs/1703.08244)"""))

if len(conflicts) > 0:
    # The 'editor' column is shown under the header 'editor_id'; highest conflict scores first
    display(qgrid.show_grid(conflicts[[
        'action', 'token', 'token_id', 'rev_id',
        'editor', 'time_diff_secs', 'conflict']].rename(columns={
            'editor': 'editor_id'}).sort_values('conflict', ascending=False)))
else:
    # Nothing to analyse on this page: say so and link back to the previous notebook
    display(md('**There are no conflicting tokens in this page.**'))
    display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))
*Page: The Camp of the Saints*
In the following table, all the actions that are in conflict are presented, and a conflict score
is presented per action. The importance of the conflict can be meassure by considering the seconds t that
have passed since the last action on the same token has occured (time_diff_secs
column). A score to
meassure conflict is calculated based on t with the following formula: 1 / log3600(t+2).
Thus, undo actions are weighted higher than the original time in seconds when the t is less than an hour.
For details, please refer to Flöck et al, 2017.
Columns description:
the editor is not registered, and the ip is displayed instead
time_diff_secs
with the following formula: 1 / log3600(time_diff_secs + 2). For details, please refer to Flöck et al, 2017
QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…
from IPython.display import display, Markdown as md, HTML as html

# Section B.2 header and a description of the controls of the word cloud shown below.
display(md("---"))
display(md('## B.2 Most frequent conflicting token strings'))
display(md(f"***Page: {the_page['title']}***"))
display(md(""" The WordCloud displays the most common conflicting token strings, i.e. words (token strings)
with the most actions that have conflict. The size of the token string in the WordCloud indicates frequency
of actions.
In the controls you can select the *date range*, the type of *action* (insertion or deletion), and the
*source*. The *source* can be any of the following:
- **Only Conflicts**: use only the actions that are in conflict.
- **Elegible Actions**: use only the actions that can potentially enter into conflict, i.e. actions
that have occurred at least twice, e.g. the token x has been inserted twice (which necessarily implies
it was removed once), the token x has been deleted twice (which necessarily implies it was inserted twice)
- **All Actions**: use all tokens regardless of conflict
"""))
*Page: The Camp of the Saints*
The WordCloud displays the most common conflicting token strings, i.e. words (token strings) with the most actions that have conflict. The size of the token string in the WordCloud indicates frequency of actions. In the controls you can select the date range, the type of action (insertion or deletion), and the source. The source can be any of the following:
that have occurred at least twice, e.g. the token x has been inserted twice (which necessarily implies it was remove once), the token x has been deleted twice (which necessarily implies it was inserted twice)
from ipywidgets import interact, fixed
from ipywidgets.widgets import Dropdown, HTML, interactive_output, VBox
from utils.notebooks import get_date_slider_from_datetime
from visualization.wordcloud_listener import WCListener

# Word-cloud listener, fed with the three action sets exposed by the calculator
listener = WCListener(
    sources={
        'All actions': calculator.all_actions,
        'Elegible Actions': calculator.elegible_actions,
        'Only Conflicts': calculator.conflicts,
    }
)

# Controls: date range over all actions, data source, action type; editor is fixed to 'All'
_range = get_date_slider_from_datetime(calculator.all_actions['rev_time'])
source = Dropdown(
    options=list(listener.sources.keys()),
    value='Only Conflicts',
    description='Source (*)',
)
action = Dropdown(
    options=['Both', 'Just Insertions', 'Just Deletions'],
    value='Both',
    description='Action',
)
editor = fixed('All')

# Bind the controls to the listener and stack them above its output
out = interactive_output(
    listener.listen,
    {'_range': _range, 'source': source, 'action': action, 'editor': editor},
)
display(VBox([_range, action, source, out]))
VBox(children=(SelectionRangeSlider(continuous_update=False, description='Date Range', index=(0, 218), layout=…
from IPython.display import HTML
from utils.notebooks import get_next_notebook, get_previous_notebook
%store agg_actions
%store calculator
%store editors_conflicts
clear_output()
if len(editors_conflicts) > 0:
display(HTML(f'<a href="{get_next_notebook()}" target="_blank">Go to next workbook</a>'))
else:
display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~/wikiwho_demo/utils/notebooks.py in get_next_notebook() 53 try: ---> 54 _id = int(notebook_name()[0]) + 1 55 except: TypeError: 'NoneType' object is not subscriptable During handling of the above exception, another exception occurred: TypeError Traceback (most recent call last) <ipython-input-137-13b75834a644> in <module> 10 11 if len(editors_conflicts) > 0: ---> 12 display(HTML(f'<a href="{get_next_notebook()}" target="_blank">Go to next workbook</a>')) 13 else: 14 display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>')) ~/wikiwho_demo/utils/notebooks.py in get_next_notebook() 54 _id = int(notebook_name()[0]) + 1 55 except: ---> 56 _id = int(notebook_name()[1]) + 1 57 return glob.glob(f"{_id}*.ipynb")[0] 58 TypeError: 'NoneType' object is not subscriptable