In [ ]:
%load_ext autoreload
%autoreload 2
%store -r the_page

if 'the_page' not in locals():
    import pickle
    print("Loading default data...")
    the_page = pickle.load(open("data/the_page.p",'rb'))

from IPython.display import display, Markdown as md
display(md("---"))
display(md(f"# A. Actions: insertions and deletions"))
display(md(f" Provided by the [WikiWho API](https://www.wikiwho.net/en/api/v1.0.0-beta/)"))
display(md(f"***Page: {the_page['title']}***"))
display(md(f"The [WikiWho API](https://www.wikiwho.net/en/api/v1.0.0-beta/) tracks every token on a "
           "Wikipedia page. The underlying WikiWho algorithm is able to distinguis every token on the "
           'document even when the token appears several times, e.g. the token "the" appears multiple times'
           "and yet WikiWho tracks them singularly (95% accuracy)."
          ))
display(md("Give a uniquely tracked token, there are two possible actions that can perfomed: i.e. **insertions** "
          'and **deletions** (a character change in a word, e.g. "color" -> "colour", is modeled as deletion of '
          '"color" and the insertion of "colour", two separate tokens). An **insertion** is also considered '
          "a **re-insertion** if the insertion has occured before; the only insertion of a token that is not "
          "a re-insertion is the first one. Similarly, a **deletion** is also considered a **re-deletion** if "
          "the deletion has occured before."))
display(md("Formally, the token history can represented by a time-ordered sequence of actions "
           "*(a<sub>0</sub>, ..., a<sub>n</sub>)*; note that *a<sub>0+2i</sub>* is always an insertion and"
           "*a<sub>1+2i</sub>* is always a deletion for i ∈ ℕ."))
display(md("---\n***IMPORTANT:*** For articles with long revisions history, the process can take a long time. "
           "Please give the some time to load (see cog wheel symbol right of 'edit app') "
           "before interacting with the controls too often!"))
In [ ]:
from wikiwho_wrapper import WikiWho
import pandas as pd
import qgrid
# set the default max number of rows to 5 so the larger DataFrames we render don't take up too much space
qgrid.set_grid_option('maxVisibleRows', 5)

wikiwho = WikiWho(lng='en')
# dict-style access for consistency with `the_page['title']` / `the_page['page_id']` used elsewhere
agg_actions = wikiwho.dv.edit_persistence(the_page['page_id'])

# define total columns
total_columns = ['total', 'total_surv_48h', 'total_persistent', 'total_stopword_count']

# add columns with the total actions (element-wise sum of adds + dels + reins column groups)
agg_actions = agg_actions.join(pd.DataFrame(
    agg_actions.loc[:,'adds':'adds_stopword_count'].values +\
    agg_actions.loc[:,'dels':'dels_stopword_count'].values +\
    agg_actions.loc[:,'reins':'reins_stopword_count'].values, 
    index=agg_actions.index, 
    columns=total_columns
))

display(md("## A.1 Total actions per month and editor"))
display(md(f"***Page: {the_page['title']}***"))
display(md("""The following table shows the total number of actions (insertions + deletions) per month 
(`year_month` column), and editor (`editor_id` and `editor` columns)."""))
display(md("""**Columns description:**
- **total**: total number of actions (insertions, and deletions)
- **total_surv_48h**: total number of actions that survived at least 48 hours
- **total_persistent**:  total number of actions that survived until, at least, the end of the month
- **total_stopword_count**:  total number of actions that were performed in stop words"""))

from IPython.display import clear_output
from ipywidgets import Output

# the output widget lets us replace the grid in place once editor names have been downloaded
out = Output()
display(out)
with out:
    print("Downloading editor usernames (i.e. *editor* column)...")
    display(qgrid.show_grid(agg_actions[['year_month', 'editor_id'] + total_columns]))

# Grab user names from wikipedia and merge them to the aggregated actions dataframe
from external.wikipedia import WikipediaDV, WikipediaAPI
wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))
editors = wikipedia_dv.get_editors(agg_actions['editor_id'].unique()).rename(columns = {
    'userid': 'editor_id'})

# Merge the names of the editors to the aggregate actions dataframe
agg_actions = agg_actions.merge(editors[['editor_id', 'name']], on='editor_id')
agg_actions.insert(3, 'editor', agg_actions['name'])
agg_actions = agg_actions.drop(columns=['name'])
agg_actions['editor'] = agg_actions['editor'].fillna("Unregistered")

with out:
    clear_output()
    display(qgrid.show_grid(agg_actions[['year_month', 'editor_id', 'editor'] + total_columns]))
In [ ]:
display(md("""## A.2. Visualization of actions per month"""))
display(md(f"***Page: {the_page['title']}***"))
display(md("""In the following graph you can select the *date range* and *granularity* (yearly, monthly) 
of the timeline (X-axis), and plot any of the following counts in the black, red, blue and green lines:
   
- **adds**: number of first-time insertions
- **adds_surv_48h**: number of insertions for the first time that survived at least 48 hours
- **adds_persistent**:  number of insertions for the first time that survived until, at least, the end of the month
- **adds_stopword_count**:  number of insertions that were stop words
- **dels**: number of deletions
- **dels_surv_48h**: number of deletions that were not reinserted in the next 48 hours
- **dels_persistent**: number of deletions that were not reinserted until, at least, the end of the month
- **dels_stopword_count**: number of deletions that were stop words
- **reins**: number of reinsertions
- **reins_surv_48h**: number of reinsertions that survived at least 48 hours
- **reins_persistent**: number of reinsertions that survived until the end of the month
- **reins_stopword_count**: number of reinsertions that were stop words
"""))

# Convert to datetime so the date slider below can work with real timestamps
agg_actions['year_month'] = pd.to_datetime(agg_actions['year_month'])

# NOTE(review): a previous draft grouped by year_month/page_id here, but its result
# was discarded, so the dead statement has been removed.

# Listener
from visualization.actions_listener import ActionsListener
listener = ActionsListener(agg_actions)
action_types = (agg_actions.loc[:,'total':'total_stopword_count'].columns.append(
    agg_actions.loc[:,'adds':'reins_stopword_count'].columns)).values.tolist()

# Visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact, fixed
from ipywidgets.widgets import Dropdown

interact(listener.listen,
         _range = get_date_slider_from_datetime(agg_actions['year_month']),
         editor=fixed('All'),
         granularity=Dropdown(options=['Yearly', 'Monthly'], value='Yearly'),
         black=Dropdown(options=action_types, value='total'), 
         red=Dropdown(options= ['None'] + action_types, value='total_surv_48h'),
         green=Dropdown(options= ['None'] + action_types, value='None'), 
         blue=Dropdown(options= ['None'] + action_types, value='None'))
In [ ]:
from IPython.display import display, Markdown as md, HTML as html
display(md("---"))
display(md("# B. Measuring conflict"))
display(md(f"***Page: {the_page['title']}***"))
display(md("""
A measurement of ***conflict*** is provided in [Flöck et al](https://arxiv.org/abs/1703.08244). They consider
that an action (insertion or deletion) on a token `x` presents ***conflict*** if (1) it is not the first 
time the action happens, i.e. it is a re-insertion, or re-deletion of `x`, and if (2) the re-insertion or 
re-deletion is not a contiguous undo performed by the same editor, e.g. editor `a` inserted `x`, and 
then deleted it. For the last rule (2), only contiguous actions for `x` are considered, meaning that 
no other editors performed actions on `x` between the do and undo of editor `a`. """ ))

display(html("""Formally, an action <i>a<sub>i</sub></i> (insertion or deletion)
on a token x written by editor <i>A</i> is in conflict in revision <i>r</i> if<br /><ol>
<li><b>there exists an action <i>a<sub>i-2</sub></i> </b>, note that <i>a<sub>i</sub> is equal to 
a<sub>i-2</sub></i>, i.e. it is a reinsertion or redeletion, because insertions and deletions 
must alternate, and if</li>
<li><b> the previous action <i>a<sub>i-1</sub></i> was not performed by editor <i>A</i></b>, i.e. 
it is not an undo performed by the same editor</li>
</ol>

The next section will present the Conflict Score of each action in the revision.
"""))
In [ ]:
# Instantiate the WikiWho API wrapper
from wikiwho_wrapper import WikiWho
from IPython.display import display, Markdown as md, clear_output

wikiwho = WikiWho(lng='en')
page_id = the_page['page_id']

# Fetch the full token content and the list of revision ids for the page
display(md("Downloading all_content from the WikiWhoApi..."))
all_content = wikiwho.dv.all_content(page_id)

display(md("Downloading revisions from the WikiWhoApi..."))
revisions = wikiwho.dv.rev_ids_of_article(page_id)

# Hide the progress messages once both downloads have finished
clear_output()
In [ ]:
from metrics.conflict import ConflictManager
from wikiwho_wrapper import WikiWho
from IPython.display import clear_output
from IPython.display import HTML
from utils.notebooks import get_next_notebook, get_previous_notebook

# run the conflict calculator over the page's content and revision history
calculator = ConflictManager(all_content, revisions)
calculator.calculate()
clear_output()

# display the tokens, the difference in seconds and its corresponding conflict score
conflicts = calculator.conflicts.copy()
conflicts['time_diff_secs'] = conflicts['time_diff'].dt.total_seconds()

display(md('## B.1 Conflict score of each singular action'))
display(md(f"***Page: {the_page['title']}***"))
display(md("""In the following table, all the actions that are in conflict are presented, and a conflict score
is presented per action. The importance of the conflict can be measured by considering the seconds t that 
have passed since the last action on the same token has occurred (`time_diff_secs` column). A score to 
measure conflict is calculated based on t with the following formula: 1 / log<sub>3600</sub>(t+2). 
Thus, *undo* actions are weighted higher than the original time in seconds when the *t* is less than an hour.
For details, please refer to [Flöck et al, 2017](https://arxiv.org/abs/1703.08244).
**Columns description:**
- **token**: the string of the token that is being tracked
- **token_id**: the id of the token that is being tracked
- **rev_id**: the revision id in which the action (insertion or deletion) happened
- **editor_id**: the id of the editor that inserted the token (if it starts with **0|**, it means that
the editor is not registered, and the ip is displayed instead)
- **time_diff_secs**: seconds that have passed since the last action on the same token has occurred
- **conflict**: a score to measure conflict that is calculated based on the `time_diff_secs` 
with the following formula: *1 / log<sub>3600</sub>(time_diff_secs + 2)*. For details, please refer to 
[Flöck et al, 2017](https://arxiv.org/abs/1703.08244)"""))

if len(conflicts) > 0:
    display(qgrid.show_grid(conflicts[[
        'action', 'token', 'token_id', 'rev_id', 
        'editor', 'time_diff_secs', 'conflict']].rename(columns={
        'editor': 'editor_id'}).sort_values('conflict', ascending=False)))
else:
    display(md('**There are no conflicting tokens in this page.**'))
    display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))
In [ ]:
from IPython.display import display, Markdown as md, HTML as html
display(md("---"))
display(md('## B.2 Most frequent conflicting token strings'))
display(md(f"***Page: {the_page['title']}***"))

# NOTE: the "Elegible" spelling is kept because it matches the source labels used
# by the word-cloud controls in the next cell
display(md(""" The WordCloud displays the most common conflicting token strings, i.e. words (token strings) 
with the most actions that have conflict. The size of the token string in the WordCloud indicates frequency 
of actions.
In the controls you can select the *date range*, the type of *action* (insertion or deletion), and the 
*source*. The *source* can be any of the following:
-   **Only Conflicts**: use only the actions that are in conflict.
-   **Elegible Actions**: use only the actions that can potentially enter into conflict, i.e. actions 
that have occurred at least twice, e.g. the token x has been inserted twice (which necessarily implies 
it was removed once), the token x has been deleted twice (which necessarily implies it was inserted twice) 
-   **All Actions**: use all tokens regardless of conflict
"""))
In [ ]:
# listener
from visualization.wordcloud_listener import WCListener

# the source labels double as the dropdown options below; 'All Actions' is
# capitalized to match the documentation in section B.2
listener = WCListener(sources = {
    'All Actions': calculator.all_actions,
    'Elegible Actions': calculator.elegible_actions,
    'Only Conflicts': calculator.conflicts
})

# visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact, fixed

from ipywidgets.widgets import Dropdown, HTML, interactive_output, VBox

_range=get_date_slider_from_datetime(calculator.all_actions['rev_time'])
source=Dropdown(options=list(listener.sources.keys()), value='Only Conflicts', description='Source (*)')
action=Dropdown(options=['Both', 'Just Insertions', 'Just Deletions'], value='Both', description='Action')
editor=fixed('All')

out = interactive_output(listener.listen, {
         '_range': _range,
         'source': source,
         'action': action,
         'editor': editor})

display(VBox([_range, action, source, out]))
In [ ]:
from IPython.display import display, Markdown as md
display(md("---"))
display(md('## B.3 Page Conflict Score and related metrics'))
display(md(f"***Page: {the_page['title']}***"))
display(md("""The total conflict of a page is the sum of all the conflict scores of all actions with 
conflict (or conflict actions). This can be normalized if the sum is divided by the number of 
actions that can potentially enter into conflict (elegible actions, i.e actions that have occurred at 
least twice).

In the following graph you can select the *date range* and *granularity* (yearly, monthly) 
of the timeline (X-axis), and plot any of the following counts in the black and red lines:
   
- **Conflict Score**: the sum of conflict scores of all actions divided by the number of elegible actions
- **Absolute Conflict Score**: the sum of conflict scores of all actions (without division)
- **Conflict Ratio**: the count of all conflicts divided by the number of elegible actions
- **Number of Conflicts**: the total number of conflicts
- **Total Elegible Actions**: the total number of elegible actions
- **Total Conflict Time**: the sum of all the times (*time_diff_secs*) that has been taken by conflict actions
- **Total Elegible Time**: the sum of all the times (*time_diff_secs*) that has been taken by elegible actions
- **Time per Conflict Action**: average time of conflict actions
- **Time per Elegible Action**: average time of elegible actions
"""))
In [ ]:
# Visualization
from visualization.conflicts_listener import ConflictsListener
elegible_actions = calculator.elegible_actions.copy()
listener = ConflictsListener(elegible_actions)

metrics = ['Conflict Score', 'Absolute Conflict Score', 
           'Conflict Ratio',  'Number of Conflicts', 
           'Total Elegible Actions', 
           'Total Conflict Time', 'Total Elegible Time', 
           'Time per Conflict Action', 'Time per Elegible Action']

# compute the page-level score once and reuse it below (avoids a second pass)
page_conflict_score = calculator.get_page_conflict_score()
display(md(f'**Page conflict score: {page_conflict_score}**'))

# Visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact
from ipywidgets.widgets import Dropdown

# only show the interactive plot when there is something to plot
if page_conflict_score != 0:
    interact(listener.listen,
             _range = get_date_slider_from_datetime(elegible_actions['rev_time']),
             granularity=Dropdown(options=['Yearly', 'Monthly', 'Daily'], value='Monthly'),
             black=Dropdown(options=metrics, value='Conflict Score'),
             red=Dropdown(options= ['None'] + metrics, value='None'))
In [ ]:
from IPython.display import display, Markdown as md
display(md("---"))
display(md('## B.4 Editor Conflict Score'))
display(md(f"***Page: {the_page['title']}***"))
display(md("""Similar to the previous idea, it is possible to calculate the editor conflict score. The
table below presents the conflict score and other related metrics per editor (*editor_id* and *editor*
column):

- **conflict_n**: the total number of conflicts
- **conflict**: the sum of conflict scores of all actions (without division)
- **actions**: the total number of actions performed by the editor
- **conflict_score**: the sum of conflict scores of all actions divided by the number of elegible actions
- **conflict_ratio**: the count of all conflicts divided by the number of elegible actions
"""))
In [ ]:
editors_conflicts = calculator.get_conflict_score_per_editor()

# align the id dtype before merging — assumes the conflict-score index is
# string-typed editor ids (TODO confirm against ConflictManager)
editors['editor_id'] = editors['editor_id'].astype(str)
if len(editors_conflicts) > 0:
    # attach editor names; the conflict scores are indexed by editor_id
    editors_conflicts = editors[['editor_id', 'name']].merge(editors_conflicts, 
                                                right_index=True, left_on='editor_id').set_index('editor_id')
    qg_obj = qgrid.show_grid(editors_conflicts)
    display(qg_obj)
else:
    display(md('**There are no conflict scores**')) 
    # sentinel consumed by the navigation cell below
    editors_conflicts = None
In [ ]:
from IPython.display import HTML
from utils.notebooks import get_next_notebook, get_previous_notebook

%store agg_actions
%store calculator
%store editors_conflicts

clear_output()
        

if len(editors_conflicts) > 0:
    display(HTML(f'<a href="{get_next_notebook()}" target="_blank">Go to next workbook</a>'))
else:
    display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))