In [ ]:
# Enable autoreload (mode 2) so edited project modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
# Restore `the_page` if it was previously persisted with `%store`.
%store -r the_page

# Fall back to the bundled sample page when `%store -r` did not restore `the_page`.
if 'the_page' not in locals():
    import pickle
    print("Loading default data...")
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file handle, which a bare open() would leak.
    with open("data/the_page.p", 'rb') as default_page_file:
        the_page = pickle.load(default_page_file)
In [ ]:
from IPython.display import Markdown as md, display

# Notebook title (the page name) followed by a blank markdown spacer.
page_heading = md(f"# ***Page: {the_page['title']}***")
display(page_heading)
display(md(" "))
In [ ]:
# Introductory text for section A; each entry below is rendered as its own markdown output.
wikiwho_api_text = (
    "The [WikiWho API](https://www.wikiwho.net/en/api/v1.0.0-beta/) tracks the changes to every token (words or special characters) on a "
    "Wikipedia page with at least 95% accuracy. It distinguishes every token in the "
    'document even when the string appears several times. E.g. "and" at the beginning of an article is a different token then "and" at the end of the article. '
    "See also [this figure](https://www.wikiwho.net/#technical_details)."
)
token_actions_text = (
    "That means that **not only edits** are counted, which can contain changes many different tokens, but *every single action to every single token* is recorded. Two actions can perfomed per token: i.e. **insertions** "
    'and **deletions** (a character change in a word, e.g. "dog" -> "dogs", is modeled as deletion of '
    '"dog" and the insertion of "dogs", two separate tokens). An **insertion** is also considered '
    "a **re-insertion** if the insertion has occured before; the only insertion of a token that is not "
    "a re-insertion is the first one. Similarly, a **deletion** is also considered a **re-deletion** if "
    "the deletion has occured before."
)
formal_history_text = (
    "Formally, the token history can represented by a time-ordered sequence of actions "
    "*(a<sub>0</sub>, ..., a<sub>n</sub>)*; note that *a<sub>0+2i</sub>* is always an insertion and "
    "*a<sub>1+2i</sub>* is always a deletion for i ∈ ℕ."
)
loading_notice_text = (
    "---\n***IMPORTANT:*** For articles with a long revision history, "
    "please allow for some time to load (see cog wheel symbol right of 'edit app') "
    "before interacting with the controls too often."
)

for section_text in (
    "---",
    "# A. Article actions and conflict",
    wikiwho_api_text,
    token_actions_text,
    formal_history_text,
    loading_notice_text,
):
    display(md(section_text))
In [ ]:
from wikiwho_wrapper import WikiWho
import pandas as pd
import qgrid
# limit qgrid to 5 visible rows so the larger DataFrames we render don't take up too much space
qgrid.set_grid_option('maxVisibleRows', 5)

wikiwho = WikiWho(lng='en')
# Subscript access matches every other use of `the_page` in this notebook and also
# works if `the_page` is a plain mapping rather than an object with attributes.
agg_actions = wikiwho.dv.edit_persistence(the_page['page_id'])

# define total columns
total_columns = ['total', 'total_surv_48h', 'total_persistent', 'total_stopword_count']

# add columns with the total actions: the adds, dels and reins column groups are summed
# position-wise, so each label slice must select as many columns as `total_columns` has
agg_actions = agg_actions.join(pd.DataFrame(
    agg_actions.loc[:, 'adds':'adds_stopword_count'].values
    + agg_actions.loc[:, 'dels':'dels_stopword_count'].values
    + agg_actions.loc[:, 'reins':'reins_stopword_count'].values,
    index=agg_actions.index,
    columns=total_columns
))

# Heading, page name and column legend for the per-month / per-editor totals table.
totals_table_intro = """The following table shows the total number of actions (insertions + deletions) per month 
(`year_month` column), and editor (`editor_id` and `editor` columns)."""
totals_columns_legend = """**Columns description:**
- **total**: total number of actions (insertions, and deletions)
- **total_surv_48h**: total number of actions that survived at least 48 hours
- **total_persistent**:  total number of actions that survived until, at least, the end of the month
- **total_stopword_count**:  total number of actions that were performed in stop words"""

display(md("## A.1 Total actions per month and editor"))
display(md(f"***Page: {the_page['title']}***"))
display(md(totals_table_intro))
display(md(totals_columns_legend))

from IPython.display import clear_output
from ipywidgets import Output

# One output widget hosts the grid so it can be replaced once the user names arrive
out = Output()
display(out)
with out:
    print("Downloading editor usernames (i.e. *editor* column)...")
    preview_columns = ['year_month', 'editor_id'] + total_columns
    display(qgrid.show_grid(agg_actions[preview_columns]))

# Look up the user names on Wikipedia for every distinct editor id
from external.wikipedia import WikipediaDV, WikipediaAPI
wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))
unique_editor_ids = agg_actions['editor_id'].unique()
editors = wikipedia_dv.get_editors(unique_editor_ids).rename(columns={'userid': 'editor_id'})

# Attach the names to the aggregated actions as an `editor` column at position 3.
# NOTE(review): the merge is an inner join, so rows whose editor_id is absent from
# `editors` are dropped before the "Unregistered" fill can apply — confirm this is intended.
agg_actions = agg_actions.merge(editors[['editor_id', 'name']], on='editor_id')
editor_names = agg_actions.pop('name')
agg_actions.insert(3, 'editor', editor_names.fillna("Unregistered"))

with out:
    clear_output()
    named_columns = ['year_month', 'editor_id', 'editor'] + total_columns
    display(qgrid.show_grid(agg_actions[named_columns]))
In [ ]:
# Heading, page name and the legend of the counts that can be plotted in section A.2.
actions_plot_legend = """In the following graph you can select the *date range* and *granularity* (yearly, montly) 
of the timeline (X-axis), and plot any of the follow counts in the black, red, blue and green lines:
   
- **adds**: number of first-time insertions
- **adds_surv_48h**: number of insertions for the first time that survived at least 48 hours
- **adds_persistent**:  number of insertions for the first time that survived until, at least, the end of the month
- **adds_stopword_count**:  number of insertions that were stop words
- **dels**: number of deletions
- **dels_surv_48h**: number of deletions that were not resinserted in the next 48 hours
- **dels_persistent**: number of deletions that were not resinserted until, at least, the end of the month
- **dels_stopword_count**: number of deletions that were stop words
- **reins**: number of reinsertions
- **reins_surv_48h**: number of reinsertions that survived at least 48 hours
- **reins_persistent**: number of reinsertionsthat survived until the end of the month
- **reins_stopword_count**: number of reinsertionsthat were stop words

**What do these actions/counts mean?** For instance, if you see 10 "adds" in a month, but only 4 "adds_surv_48h", 10 completely new tokens/words have been added to the article, but only 4 of them stayed in the article for more than 2 days, which usually means the other 6 are gone for good. If "dels" are performed and don't survive, that means that these deletions have been undone, i.e., the deleted tokens have been put back. I.e., these are measurements of the longevity and stability of edit actions done to the article. 

"""

display(md("## A.2. Visualization of actions per month"))
display(md(f"***Page: {the_page['title']}***"))
display(md(actions_plot_legend))



# Convert to datetime
agg_actions['year_month'] = pd.to_datetime(agg_actions['year_month'])

# The per-editor frame is handed to the listener unchanged. A grouped-by-month copy
# used to be computed here but was never assigned, so that dead statement was removed.

# Listener
from visualization.actions_listener import ActionsListener
listener = ActionsListener(agg_actions)
# NOTE(review): positional slice; assumes columns 4..15 are the twelve
# adds/dels/reins counts listed in the legend of this section — confirm.
action_types = (agg_actions.columns[4:16]).values.tolist()

# Visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact, fixed
from ipywidgets.widgets import Dropdown

# The editor is pinned to 'All'; the remaining controls choose range, granularity and lines
interact(listener.listen,
         _range = get_date_slider_from_datetime(agg_actions['year_month']),
         editor=fixed('All'),
         granularity=Dropdown(options=['Yearly', 'Monthly'], value='Yearly'),
         black=Dropdown(options=action_types, value='adds'), 
         red=Dropdown(options= ['None'] + action_types, value='dels'),
         green=Dropdown(options= ['None'] + action_types, value='None'), 
         blue=Dropdown(options= ['None'] + action_types, value='None'))
In [ ]:
from IPython.display import Markdown as md, display

# Section A.3 header: a rule, the section title, then the page being analysed.
display(md("---"))
display(md('## A.3 Page Conflict'))
page_label = f"***Page: {the_page['title']}***"
display(md(page_label))

Our measurement of conflict for single tokens is taken from Flöck et al.:

  • (1) The main idea is to count how often a token - after being created (added) the first time - was being deleted, re-inserted, re-deleted, re-inserted, and so on; which would often happen in case two editors disagree on the token's justification to be in the text.
  • (2) Only the re-deletions and re-insertions are counted, since up to the first delete it could be a simple correction that didn't trigger a response - this wouldn't indicate conflict.
  • (3) The "re-" actions are only counted if they alternate between different editors and don't come from the same editor twice or more in a row - as the latter would simply indicate self-corrections.
  • (4) In a last step, each re-insertion/re-deletion interaction gets a higher weight the faster it occurs (see Flöck et al. for the exact formula).

The total conflict of a page is the sum of all the conflict scores of all actions with conflict (or conflict actions).

This total conflict can be normalized if the sum is divided by the number of actions that could potentially be counted as conflict (eligible actions, i.e. "re-" actions that have occurred at least twice).

In the following graph you can select the date range and granularity (yearly, monthly) of the timeline (X-axis), and plot any of the following counts in the black and red lines:

  • Total: total number of actions (insertions, and deletions)
  • Total_surv_48h: total number of actions that survived at least 48 hours
  • Total_persistent: total number of actions that survived until, at least, the end of the month
  • Total_stopword_count: total number of actions that were performed in stop words
  • Total Elegible Actions: the total number of eligible actions
  • Number of Conflicts: the total number of conflicts
  • Number of Revisions: the total number of revisions/edits
  • Conflict Score: the sum of conflict scores of all actions divided by the number of eligible actions
  • Absolute Conflict Score: the sum of conflict scores of all actions (without division)
  • Conflict Ratio: the count of all conflicts divided by the number of eligible actions
In [ ]:
# Visualization
from visualization.conflicts_listener import ConflictsListener
listener = ConflictsListener(agg_actions)

metrics = ['Total', 'Total_surv_48h', 'Total_persistent', 'Total_stopword_count',
           'Total Elegible Actions', 'Number of Conflicts', 'Number of Revisions',
           'Conflict Score', 'Absolute Conflict Score', 'Conflict Ratio']

# Page score = summed conflict / summed eligible actions. With no eligible actions the
# division is 0/0, which gives NaN; since `NaN != 0` is True the plot below would be
# shown (and "nan" reported) for a page without conflict. Report 0 in that case.
elegibles_total = agg_actions.elegibles.sum()
conflict_score = (agg_actions.conflict.sum() / elegibles_total) if elegibles_total else 0
display(md(f'**Page conflict score: {conflict_score}**'))

# Visualization
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact
from ipywidgets.widgets import Dropdown

# Only offer the interactive plot when there is some conflict to show
if (conflict_score != 0):
    interact(listener.listen,
             _range = get_date_slider_from_datetime(agg_actions['year_month']),
             granularity=Dropdown(options=['Yearly', 'Monthly'], value='Monthly'),
             black=Dropdown(options=metrics, value='Conflict Score'),
             red=Dropdown(options= ['None'] + metrics, value='None'))
In [ ]:
from IPython.display import Markdown as md, display

# Section A.4 header and the legend of the per-editor conflict table.
editor_conflict_legend = """We can also calculate the conflict score for each individual editor. The
table below presents the conflict score and other related  metrics per editor (*editor_id* and *editor*
column):

- **conflicts**: the total number of conflicts
- **elegibles**: the total number of elegible actions performed by the editor
- **conflict**: the sum of conflict scores of all actions divided by the number of elegible actions
"""

display(md("---"))
display(md('## A.4 Editor Conflict Score'))
display(md(f"***Page: {the_page['title']}***"))
display(md(editor_conflict_legend))
In [ ]:
# Sum the conflict metrics per editor, then normalise the score by the eligible actions.
editors_conflicts = agg_actions.groupby(pd.Grouper(
            key='editor_id')).agg({'conflicts': 'sum', 'elegibles': 'sum', 'conflict': 'sum'}).reset_index()
editors_conflicts['conflict'] = (editors_conflicts['conflict']/editors_conflicts['elegibles'])
if len(editors_conflicts) > 0:
    # Join on the `editor_id` column only. The previous call also passed
    # `right_index=True`, which pandas rejects when `on` is given (MergeError), and the
    # right frame's index is just a positional range after reset_index() anyway.
    editors_conflicts = editors[['editor_id', 'name']].merge(
        editors_conflicts.dropna(), on='editor_id').set_index('editor_id')
    # Rows that still contain missing values are hidden from the grid but kept in the frame
    qg_obj = qgrid.show_grid(editors_conflicts.dropna())
    display(qg_obj)
else:
    display(md('**There is no Conflict Scores**'))
    # Downstream code must handle None: it marks "no editor conflict data"
    editors_conflicts = None
In [ ]:
from IPython.display import Markdown as md, clear_output, display
from wikiwho_wrapper import WikiWho

# WikiWho client configured with lng='en'
wikiwho = WikiWho(lng='en')

# Fetch all_content first, then the revision ids, announcing each step
display(md("Downloading all_content from the WikiWhoApi..."))
all_content = wikiwho.dv.all_content(the_page['page_id'])
display(md("Downloading revisions from the WikiWhoApi..."))
revisions = wikiwho.dv.rev_ids_of_article(the_page['page_id'])

# Remove the progress messages once both downloads have returned
clear_output()
In [ ]:
from metrics.conflict import ConflictManager
from wikiwho_wrapper import WikiWho
from IPython.display import clear_output
from IPython.display import HTML
from utils.notebooks import get_next_notebook, get_previous_notebook

# Run the conflict computation, then clear whatever output it produced
calculator = ConflictManager(all_content, revisions)
calculator.calculate()
clear_output()

# Work on a copy of the calculator's conflicts frame and add the time
# difference expressed in seconds as an extra column
conflicts = calculator.conflicts.copy().assign(
    time_diff_secs=lambda frame: frame['time_diff'].dt.total_seconds())
    
# Heading, page name and column legend for the per-action conflict table (B.1).
action_conflict_legend = """In the following table, all the actions that are in conflict are presented, and a conflict score
is presented per action. The importance of the conflict can be meassure by considering the seconds t that 
have passed since the last action on the same token has occured (`time_diff_secs` column). A score to 
meassure conflict is calculated based on t with the following formula: 1 / log<sub>3600</sub>(t+2). 
Thus, *undo* actions are weighted higher than the original time in seconds when the *t* is less than an hour.
For details, please refer to [Flöck et al, 2017](https://arxiv.org/abs/1703.08244).
**Columns description:**
- **token**: the string of the token that is being tracked
- **token_id**: the id of the token that is being tracked
- **rev_id**: the revision id in which the action (insertion or deletion) happen
- **editor_id**: the id of the editor that inserted the token (if starts with **0|**, it means that
the editor is not registered, and the ip is displayed instead
- **time_diff_secs**: seconds that have passed since the last action on the same token has occured
- **conflict**: a score to meassure conflict that is calculated based on the `time_diff_secs` 
with the following formula: *1 / log<sub>3600</sub>(time_diff_secs + 2)*. For details, please refer to 
[Flöck et al, 2017](https://arxiv.org/abs/1703.08244)"""

display(md('## B.1 Conflict score of each singular action'))
display(md(f"***Page: {the_page['title']}***"))
display(md(action_conflict_legend))

# Show the conflicting actions ordered by descending score, or link back when there are none
if len(conflicts) == 0:
    display(md('**There are no conflicting tokens in this page.**'))
    display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))
else:
    shown_columns = ['action', 'token', 'token_id', 'rev_id',
                     'editor', 'time_diff_secs', 'conflict']
    ranked_conflicts = (
        conflicts[shown_columns]
        .rename(columns={'editor': 'editor_id'})
        .sort_values('conflict', ascending=False)
    )
    display(qgrid.show_grid(ranked_conflicts))
In [ ]:
from IPython.display import HTML as html, Markdown as md, display

# Section B.2 header and the explanation of the word cloud controls.
wordcloud_help = """ The WordCloud displays the most common conflicting token strings, i.e. words (token strings) 
with the most actions that have conflict. The size of the token string in the WordCloud indicates frequency 
of actions.
In the controls you can select the *date range*, the type of *action* (insertion or deletion), and the 
*source*. The *source* can be any of the following:
-   **Only Conflicts**: use only the actions that are in conflict.
-   **Elegible Actions**: use only the actions that can potentially enter into conflict, i.e. actions 
that have occurred at least twice, e.g. the token x has been inserted twice (which necessarily implies 
it was remove once), the token x has been deleted twice (which necessarily implies it was inserted twice) 
-   **All Actions**: use all tokens regardles conflict
"""

display(md("---"))
display(md('## B.2 Most frequent conflicting token strings'))
display(md(f"***Page: {the_page['title']}***"))
display(md(wordcloud_help))
In [ ]:
from visualization.wordcloud_listener import WCListener
from utils.notebooks import get_date_slider_from_datetime
from ipywidgets import interact, fixed
from ipywidgets.widgets import Dropdown, HTML, interactive_output, VBox

# The word cloud can draw from three action sets exposed by the calculator
wordcloud_sources = {
    'All actions': calculator.all_actions,
    'Elegible Actions': calculator.elegible_actions,
    'Only Conflicts': calculator.conflicts,
}
listener = WCListener(sources=wordcloud_sources)

# Controls; the editor is pinned to 'All' and is not part of the displayed box
_range = get_date_slider_from_datetime(calculator.all_actions['rev_time'])
source_options = list(listener.sources.keys())
source = Dropdown(options=source_options, value='Only Conflicts', description='Source (*)')
action = Dropdown(options=['Both', 'Just Insertions', 'Just Deletions'], value='Both', description='Action')
editor = fixed('All')

# Bind the controls to the listener and stack controls and output vertically
wordcloud_controls = {
    '_range': _range,
    'source': source,
    'action': action,
    'editor': editor,
}
out = interactive_output(listener.listen, wordcloud_controls)

display(VBox([_range, action, source, out]))
In [ ]:
from IPython.display import HTML
from utils.notebooks import get_next_notebook, get_previous_notebook

# Persist the results with %store so they can be restored elsewhere via `%store -r`.
%store agg_actions
%store calculator
%store editors_conflicts

# Hide the messages printed by the %store magics
clear_output()

# `editors_conflicts` is set to None earlier in this notebook when there are no
# conflict scores, so check for None before len() (len(None) raises TypeError).
if editors_conflicts is not None and len(editors_conflicts) > 0:
    display(HTML(f'<a href="{get_next_notebook()}" target="_blank">Go to next workbook</a>'))
else:
    display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))