# --- IPython/Jupyter setup: auto-reload edited local modules and restore
# a previously %store-d `page` variable from the notebook store. ---
%load_ext autoreload
%autoreload 2
%store -r page
# Wikipedia article under analysis (URL-style title).
page_name = 'The_Camp_of_the_Saints'
from external.wikipedia import WikipediaDV, WikipediaAPI
# Data view over the English Wikipedia API.
wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))
# Fetch page metadata; `page.page_id` is used for the WikiWho queries below.
page = wikipedia_dv.get_page(page_name)
# Render the page record as a one-column frame (notebook display).
page.to_frame('value')
from wikiwho_wrapper import WikiWho
import pandas as pd
# WikiWho service for English Wikipedia.
wikiwho = WikiWho(lng='en')
# Per-editor, per-month action counts for the page (columns include
# adds/dels/reins groups — see the slices used below; TODO confirm schema).
editions = wikiwho.dv.editions(page.page_id)
editions.head()
# Work on the editions frame; parse the month strings into datetimes.
df = editions
df['year_month'] = pd.to_datetime(df['year_month'])
# Group the data by year month and page, dropping the editor information
# (notebook display only; the grouped result is not kept).
df.drop('editor_id', axis=1).groupby(['year_month','page_id']).sum()
# Add total-action columns: the element-wise sum of the adds, dels and
# reins column groups (each label slice spans the same four variants:
# base, surviving 48h, persistent, stop-word count).
adds_block = df.loc[:, 'adds':'adds_stopword_count'].values
dels_block = df.loc[:, 'dels':'dels_stopword_count'].values
reins_block = df.loc[:, 'reins':'reins_stopword_count'].values
totals = pd.DataFrame(
    adds_block + dels_block + reins_block,
    index=df.index,
    columns=['actions',
             'actions_surv_48h',
             'actions_persistent',
             'actions_stopword_count'])
df = df.join(totals)
# Visualization: interactive per-month editions plot.
from visualization.editions_listener import DFListener
from ipywidgets import interact
# Listener renders the plot for the widget below from `df`.
listener = DFListener(df)
# Earlier widget version (kept for reference):
# interact(listener.editions_per_month,
# begin=df.year_month,
# end=df.year_month.sort_values(ascending=False),
# actions=['All Actions', 'Additions', 'Reinsertions', 'Deletions'])
# Selectable series: the four computed totals plus the raw
# adds/dels/reins column groups.
actions = df.loc[:,'actions':'actions_stopword_count'].columns.append(
df.loc[:,'adds':'reins_stopword_count'].columns)
# Interactive plot: date range, yearly/monthly granularity, and two
# action series to compare.
interact(listener.editions_per_month,
begin=df.year_month,
end=df.year_month.sort_values(ascending=False),
granularity=['Yearly', 'Monthly'],
first_action=actions,
second_action=actions)
# Unique editor ids that acted on the page.
editors = editions['editor_id'].unique()
editors
from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV
from metrics.Conflict_Score import conflictScore_token_list
api = WikiWhoAPI(lng='en')
wikiwho_dv = WikiWhoDV(api)
# All revision ids and the full token (word) history of the page.
revisions = wikiwho_dv.rev_ids_of_article(page.page_id)
tokens = wikiwho_dv.all_content(page.page_id)
from metrics.Conflict_Score import conflictScore_token_list
# Conflict score for 162969 — presumably an editor id; TODO confirm.
conflictScore_token_list(tokens, 162969, revisions)
revisions.head()
# Tokens first inserted at their origin revision carry in == -1; replace
# with the origin revision id so 'in' always holds a real revision.
tokens.loc[tokens['in'] == -1,'in'] = tokens.loc[tokens['in'] == -1,'o_rev_id']
tokens.head()
#tokens[tokens['o_editor']=='25450560']
# Attach revision metadata to each token's insertion and deletion events.
# `rev_ins`: revision columns renamed to align with the token frame's
# 'in' key; `rev_outs`: the same columns aligned with the 'out' key.
in_renames = {'rev_time': 'in_rev_time',
              'rev_id': 'in',
              'o_editor': 'in_editor'}
out_renames = {'rev_time': 'out_rev_time',
               'rev_id': 'out',
               'o_editor': 'out_editor'}
rev_ins = revisions[['rev_time', 'rev_id', 'o_editor']].rename(columns=in_renames)
rev_outs = revisions[['rev_time', 'rev_id', 'o_editor']].rename(columns=out_renames)
# Left-join the token events with the revision metadata on both keys.
fulltokens = tokens.merge(rev_ins, how='left', on='in')
fulltokens = fulltokens.merge(rev_outs, how='left', on='out')
fulltokens.head()
# Index by token (notebook display only; result is not assigned).
tokens.set_index('token_id')
# Tokens with more than 4 actions are candidates for edit conflicts.
possible_conflicts = tokens.groupby('token_id').size() > 4
possible_conflicts[possible_conflicts].index
# Restrict to the candidate tokens.
tmp = tokens[tokens['token_id'].isin(possible_conflicts[possible_conflicts].index)]
tmp.groupby('token_id').shift(1)
# NOTE(review): the empty loop deliberately leaks the *last* group into
# `tgroup` for ad-hoc inspection below (notebook idiom).
for name, tgroup in tmp.groupby('token_id'):
    pass
tgroup
tgroup.shift(-1).reset_index()
# Rename in/out so wide_to_long can split them on the 'rev_id_' stub.
rgrouprn = tgroup.rename(
columns={'in': 'rev_id_in', 'out':'rev_id_out'}).reset_index()
rgrouprn
# Reshape to long format: one row per in/out action of the token.
pd.wide_to_long(rgrouprn,
'rev_id',
'index',
'test', sep='_', suffix='.+')
# --- Exploratory: revision content for the 'Evolution' article ---
from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV
from metrics.Conflict_Score import conflictScore_token_list
api = WikiWhoAPI(lng='en')
wikiwho_dv = WikiWhoDV(api)
revisions = wikiwho_dv.rev_ids_of_article('Evolution')
revisions = revisions.rename(columns={'o_editor': 'editor'})
# BUG FIX: the parsed timestamps were being stored under a column named
# 'Evolution' (copy-paste of the page title); overwrite 'rev_time' instead,
# matching the commented variants below and the later cell at 'Chicago'.
revisions['rev_time'] = pd.to_datetime(revisions['rev_time'])
#revisions['rev_time__'] = pd.to_datetime(dups_sorted['rev_time'],format='%Y-%m-%dT%H:%M:%SZ')
#revisions['rev_time__'] = dups_sorted['rev_time'].str.,format='%Y-%m-%dT%H:%M:%SZ')
#rev_id = revisions[revisions['rev_time'] < pd.Timestamp(2016,11,1)].sort_values(
#    'rev_time', ascending=False).iloc[0,:]['rev_id']
# NOTE(review): `rev_id` is only defined in the commented lines above and
# `the_page` is defined in a later cell — this relies on out-of-order
# notebook execution and raises NameError when run top-to-bottom.
spec_revision = wikiwho.dv.specific_rev_content_by_article_title(the_page, rev_id)
# Tokens of that specific revision, enriched with their in/out revision ids.
spec_tokens = pd.merge(
    spec_revision.drop(columns=['rev_id', 'rev_time', 'rev_editor']),
    tokens[['token_id', 'in', 'out']],
    how='left', on='token_id')
spec_tokens.shape
#tokens_alt = tokens.copy()
#tokens_alt.loc[tokens_alt['in'] == -1,'in'] = tokens_alt.loc[tokens_alt['in'] == -1,'o_rev_id']
#tokens_alt.shape
# --- Fresh, self-contained setup for the 'Chicago' article ---
from wikiwho_wrapper import WikiWho
the_page = 'Chicago' #
#the_page = page.page_id
wikiwho = WikiWho(lng='en')
# All revisions of the article, with parsed timestamps and a uniform
# 'editor' column name.
revisions = wikiwho.dv.rev_ids_of_article(the_page)
import pandas as pd
revisions = revisions.rename(columns={'o_editor': 'editor'})
revisions['rev_time'] = pd.to_datetime(revisions['rev_time'])
revisions.shape
# Full token (word) history of the article.
tokens = wikiwho.dv.all_content(the_page)
tokens.shape
# Tokens inserted at their origin revision carry in == -1; replace with the
# origin revision id so 'in' always holds a real revision.
tokens.loc[tokens['in'] == -1,'in'] = tokens.loc[tokens['in'] == -1,'o_rev_id']
the_tokens = tokens #spec_tokens
# Keep only tokens that appear more than once — candidates for conflict.
dups = the_tokens[the_tokens.duplicated(subset=['token_id'], keep=False)]
#dups = the_tokens#[the_tokens['in'] != -1]
dups.shape
# FIX: close the stop-word file deterministically (was a bare open().read()
# that leaked the file handle).
with open('data/stopword_list.txt', 'r') as stopword_file:
    stop_words = stopword_file.read().split()
# Drop stop-word tokens from the conflict candidates.
dups = dups[~dups['token'].isin(stop_words)]
dups.shape
# FIX: numpy is used below (np.nan) but was first imported only in a much
# later cell — import it here so this cell runs top-to-bottom.
import numpy as np
# Reshape each duplicate-token row (one row holding both in and out) into
# long format: one row per action, keyed by 'action' in {'in', 'out'}.
dups_long = pd.wide_to_long(
    dups.rename(columns={'in': 'rev_id_in',
                         'out': 'rev_id_out'}).reset_index(),
    'rev_id', 'index', 'action', sep='_', suffix='.+'
).reset_index().drop(columns='index').sort_values('token_id')
dups_long.head()
# Attach revision timestamp and editor to each action.
dups_merged = pd.merge(dups_long, revisions[['rev_time', 'rev_id', 'editor']],
                       how='left', on='rev_id')
dups_sorted = dups_merged.sort_values(['token_id', 'rev_time'])
# Time elapsed between an action and the action two rows earlier
# (an undo-redo pair when the conflict mask below holds).
dups_sorted['time_diff'] = dups_sorted['rev_time'] - dups_sorted.shift(2)['rev_time']
# Mask out diffs at token-history starts so the flat shift(2) behaves like
# a per-token shift (see the commented equivalence check below).
to_remove = ((dups_sorted['o_rev_id'] == dups_sorted['rev_id']) |
             (dups_sorted.shift(1)['o_rev_id'] == dups_sorted.shift(1)['rev_id']))
dups_sorted.loc[to_remove, 'time_diff'] = np.nan
# for testing (the bottom line is equivalent to the above 3 but slow)
#dups_sorted['time_diff2'] = dups_sorted.groupby('token_id').apply(lambda group: group['rev_time'] - group.shift(2)['rev_time']).values
#(dups_sorted['time_diff'].fillna(-1) == dups_sorted['time_diff2'].fillna(-1)).all()
dups_sorted.shape
dups_dated = dups_sorted
#dups_dated = dups_sorted[dups_sorted['rev_time'] < pd.Timestamp(2016,10,10)]
# removes the last out
#dups_dated = dups_dated[dups_dated['rev_id'] != -1]
#dups_not_minus.shape
# Conflict mask: same token as the previous two rows, the previous action
# made by a different editor, and the action two back by the same editor —
# i.e. an editor re-doing their own change after someone else undid it.
conflicts = ((dups_dated['token_id'] == dups_dated.shift(1)['token_id']) &
(dups_dated['token_id'] == dups_dated.shift(2)['token_id']) &
(dups_dated['editor'] != dups_dated.shift(1)['editor']) &
(dups_dated['editor'] == dups_dated.shift(2)['editor']))
dups_dated[conflicts].shape
#dups_dated[dups_dated['token_id'] == 1760]
import numpy as np
# Per-token conflict intensity: 1 / log_3600(total conflict seconds + 2);
# faster undo-redo cycles (small time_diff) score higher.
c_t = 1 / (
np.log(
dups_dated.loc[conflicts,['token_id','time_diff']].groupby(
'token_id').sum().astype('timedelta64[s]') + 2
) / np.log(3600))
c_t.sum()# / dups_not_minus.shape[0]
# Same score rewritten as log(3600) / log(seconds + 2) — algebraically
# equal to the expression above.
c_t = np.log(3600) / (
np.log(
dups_dated.loc[conflicts,['token_id','time_diff']].groupby(
'token_id').sum().astype('timedelta64[s]') + 2
))
c_t.sum()
# Store a per-row conflict score on the frame for editor-level aggregation.
dups_dated['conflict'] = 0
dups_dated.loc[conflicts, 'conflict'] = np.log(3600) / np.log(
dups_dated.loc[conflicts,'time_diff'].astype('timedelta64[s]')+2)
# editor = '25450560'
# actions = len(dups_dated[(dups_dated['editor'] == editor) & dups_dated['time_diff'].notnull()])
# dups_dated.loc[conflicts & (dups_dated['editor'] == editor),'conflict'].sum() / actions
# Overall score normalised by the number of actions with a valid time_diff.
dups_dated.loc[conflicts, 'conflict'].sum() / len(dups_dated[dups_dated['time_diff'].notnull()])
# NOTE(review): len() of a boolean comparison series equals len(dups_dated)
# — the denominator below is NOT a filtered count; probably meant
# len(dups_dated[dups_dated['rev_id'] == dups_dated['o_rev_id']]). Confirm.
dups_dated.loc[conflicts, 'conflict'].sum() / len(dups_dated['rev_id'] == dups_dated['o_rev_id'])
# Per-editor conflict score: summed conflict / number of timed actions.
confs_ed = dups_dated.loc[conflicts, ['editor', 'conflict']].groupby('editor').sum()
actions = dups_dated.loc[dups_dated['time_diff'].notnull(),['editor','action']].groupby('editor').count()
joined = confs_ed.join(actions)
joined['conflict_score'] = joined['conflict'] / joined['action']
joined.sort_values(
'conflict_score', ascending=False)
import numpy as np
# Conflict score restricted to a single editor.
editor = '25450560'
conflictse = conflicts & (dups_dated['editor'] == editor)
# Number of this editor's actions that have a valid time_diff.
actions = len(dups_dated[(dups_dated['editor'] == editor) & dups_dated['time_diff'].notnull()])
# Per-token intensity for this editor's conflicts (same formula as above).
c_t = 1 / (
np.log(
dups_dated.loc[conflictse,['token_id','time_diff']].groupby(
'token_id').sum().astype('timedelta64[s]') + 2
) / np.log(3600))
c_t.sum() / actions # / dups_not_minus.shape[0]
import numpy as np
# Raw count of conflict rows per token (un-weighted variant).
c_t = dups_dated.loc[conflicts,:].groupby('token_id').size()
c_t.sum()
# Recompute the conflict mask on the sorted duplicates (same definition as
# above: same token as the two previous rows, previous editor differs,
# editor two back is the same).
conflicts = (
    (dups_sorted['token_id'] == dups_sorted.shift(1)['token_id']) &
    (dups_sorted['token_id'] == dups_sorted.shift(2)['token_id']) &
    (dups_sorted['editor'] != dups_sorted.shift(1)['editor']) &
    (dups_sorted['editor'] == dups_sorted.shift(2)['editor']))
# FIX: `revisions` was renamed ('o_editor' -> 'editor') in the setup cell,
# so selecting 'o_editor' here raised KeyError on a top-to-bottom run;
# select 'editor' and rename it to the in_/out_ variants instead.
# NOTE(review): if this cell was meant to run against the un-renamed
# `revisions` of an earlier cell (notebook out-of-order execution),
# confirm before relying on this.
rev_ins = revisions[['rev_time', 'rev_id', 'editor']].rename(
    columns={'rev_time': 'in_rev_time',
             'rev_id': 'in',
             'editor': 'in_editor'}
)
rev_outs = revisions[['rev_time', 'rev_id', 'editor']].rename(
    columns={
        'rev_time': 'out_rev_time',
        'rev_id': 'out',
        'editor': 'out_editor'
    }
)
# Left-join token events with revision metadata on both keys.
fulltokens = pd.merge(tokens, rev_ins, how='left', on='in')
fulltokens = pd.merge(fulltokens, rev_outs, how='left', on='out')
fulltokens.head()
def counting_token_conflict(tkn_group):
    """Count conflict actions within one token's (sorted) action history.

    A row counts as a conflict when its editor matches the editor two
    actions back but differs from the immediately preceding editor —
    an editor re-applying their own change after an intervening edit.
    """
    editors = tkn_group['editor']
    redo_by_same = editors == tkn_group.shift(2)['editor']
    undone_by_other = editors != tkn_group.shift(1)['editor']
    return (redo_by_same & undone_by_other).sum()
# Per-token conflict counts across all duplicate tokens (notebook display).
res = dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))
res
def counting_token_conflict(tkn_group):
    """Sum `time_diff` over the conflict rows of one token's history.

    A conflict row has the same editor as two actions back and a
    different editor than the immediately preceding action.
    """
    editors = tkn_group['editor']
    conflict_mask = ((editors == tkn_group.shift(2)['editor'])
                     & (editors != tkn_group.shift(1)['editor']))
    return tkn_group.loc[conflict_mask, 'time_diff'].sum()
# Total conflict time per token (notebook display).
res2 = dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))
#from datetime import timedelta
#res2[res2 > timedelta(0)]
res2
def counting_token_conflict(tkn_group):
    """Return the number of conflict rows in one token's action history.

    A row is a conflict when its editor equals the editor two rows back
    and differs from the previous row's editor.
    """
    current = tkn_group['editor']
    two_back = tkn_group.shift(2)['editor']
    one_back = tkn_group.shift(1)['editor']
    return ((current == two_back) & (current != one_back)).sum()
# Per-token conflict counts using the (re-defined) counting function.
dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))
# (dups_sorted['action'] == dups_sorted.shift(1)['action']).sum()
# for tkn, tkn_group in dups_sorted.groupby('token_id'):
# pass
# Inspect one token's full action history (notebook display).
dups_sorted[dups_sorted['token_id'] == 7162]