# Load the required libraries.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import geopandas as gpd
import ipywidgets
from ipywidgets import widgets
from ipywidgets import *
from IPython.display import display,clear_output
from ipywidgets import Layout
from traitlets import directional_link
from datetime import datetime
from datetime import date
from dateutil import rrule
from boto.s3.connection import S3Connection
import requests
from io import BytesIO
# # when I run it in mybinder, it comes up with RuntimeWarning:
# /srv/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
# return f(*args, **kwds)
# I looked for information online, it said this warning could be ignored safely.
#ipywidgets.__version__,matplotlib.__version__,pd.__version__,np.__version__,gpd.__version__
#ipywidgets.__version__,matplotlib.__version__,pd.__version__,np.__version__,gpd.__version__,quilt
#ipywidgets.__version__,matplotlib.__version__,pd.__version__,np.__version__,gpd.__version__
# prepare data for dropdown: continent list and country list
# get world info
# Load the low-resolution Natural Earth country table bundled with geopandas.
# NOTE(review): `gpd.datasets` is reportedly deprecated in newer geopandas
# releases - confirm the pinned geopandas version still provides it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Continent choices for the dropdown: "all" first, then every continent in
# alphabetical order.  Sorting replaces the arbitrary iteration order of
# list(set(...)), which could change from one kernel session to the next.
continent_list = ["all"] + sorted(set(world['continent']))

# Country choices per continent ("all" first, dataset order kept).
# No row has the continent "all", so that entry only offers the country "all".
continent_country_dict = {
    continent: ["all"] + list(world.loc[world['continent'] == continent, 'name'])
    for continent in continent_list
}
#print ("continent list for dropdown:")
#print (continent_list)
# load dataset
# To run this locally, download the dataset from the link below and load it with pd.read_csv, as the code below does.
# https://www.dropbox.com/sh/mt7by5f1wgl6n3z/AACddwkFPq5lPpH3ry83MgSDa?dl=0
# Load local dataset.
# Read the four score tables (articles/talks x people/events) from ./src.
a_people = pd.read_csv("src/article_people_score_date_geo.csv")
t_people = pd.read_csv("src/talk_people_score_date_geo.csv")
a_events = pd.read_csv("src/article_events_score_date_geo.csv")
t_events = pd.read_csv("src/talk_events_score_date_geo.csv")

# Lookup table addressed as complete_df[domain][group].
complete_df = {
    'Articles': {'People': a_people, 'Events': a_events},
    'Talks': {'People': t_people, 'Events': t_events},
}
# side functions
# Please run them before plotting.
def exclude_BC(df):
    """Return the rows of ``df`` whose ``date`` string does not start with "-".

    A leading minus sign marks a date before Christ (BC); those rows are
    dropped.  The input frame itself is not modified.
    """
    # Flag every BC row, then keep the complement.
    is_bc = df['date'].apply(lambda stamp: stamp.startswith("-")).astype(bool)
    return df[~is_bc]
def add_datetime_column(df, unit):
    """Return a copy of ``df`` with a parsed ``datetime_<unit>`` column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column named ``unit`` holding date strings.
    unit : str
        One of "date" (format "%Y-%m-%d"), "month" ("%Y-%m") or "year" ("%Y").

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; ``df`` itself is left untouched.

    Raises
    ------
    ValueError
        If ``unit`` is not supported, or if a string does not match the
        expected format.
    """
    formats = {"date": "%Y-%m-%d", "month": "%Y-%m", "year": "%Y"}
    if unit not in formats:
        # Previously an unknown unit fell through with an empty format string.
        raise ValueError(
            "unsupported unit %r; expected one of %s" % (unit, sorted(formats))
        )
    datetime_format = formats[unit]
    parsed = [datetime.strptime(one, datetime_format) for one in df[unit]]
    # .values drops the fresh 0..n-1 index so the assignment is positional.
    return df.assign(**{"datetime_" + unit: pd.Series(parsed).values})
def add_month_column(df):
    """Return a copy of ``df`` with a ``month`` column.

    The month is the first seven characters of each ``date`` string
    (the "YYYY-MM" part of a "YYYY-MM-DD" value).
    """
    month_values = pd.Series(list(map(lambda stamp: stamp[:7], df['date']))).values
    return df.assign(month=month_values)
def add_year_column(df):
    """Return a copy of ``df`` with a ``year`` column.

    The year is the first four characters of each ``date`` string
    (the "YYYY" part of a "YYYY-MM-DD" value).
    """
    year_values = pd.Series(list(map(lambda stamp: stamp[:4], df['date']))).values
    return df.assign(year=year_values)
def score_median_group_by_column(df, tcn, scn):
    """Return the per-group median of one score column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    tcn : str
        Name of the column to group by (the time column).
    scn : str
        Name of the score column to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``tcn`` and ``scn``.
    """
    # The string "median" selects the groupby median explicitly; passing
    # np.median is deprecated in recent pandas and emits a warning.
    return df.groupby(tcn).agg({scn: "median"}).reset_index()
# df_month_score=score_median_group_by_column(monthSenti,'month','score')
# print (df_month_score.head())
def score_percentile_group_by_column(df, tcn, scn, percentile, lex):
    """Return a per-group quantile of the score, length and total columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pos_score_<lex>``, ``neg_score_<lex>``, ``length``
        and ``total`` columns.
    tcn : str
        Name of the column to group by (the time column).
    scn : str
        Unused; kept so existing callers keep working.
    percentile : float
        Quantile in [0, 1], e.g. 0.25 for the 25th percentile.
    lex : str
        Lexicon suffix used in the score column names.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``tcn``, ``pos_score_<lex>``,
        ``neg_score_<lex>``, ``length`` and ``total``.
    """
    columns = [f'pos_score_{lex}', f'neg_score_{lex}', 'length', 'total']
    # Computing the quantile directly (instead of reading a "25%"-style label
    # out of describe()) works for any percentile and skips the statistics
    # that were never used.
    return df.groupby(tcn)[columns].quantile(percentile).reset_index()
# plot function
def prepare_flowplot_percentiles(lexicon,sentiment,group,domain,geo,times,ra,window,unit,min_length):
    """Filter the score tables and compute per-period median and quartiles.

    Parameters
    ----------
    lexicon : str
        Lexicon suffix of the score columns (OL, MPQA, LIWC, ANEW).
    sentiment : str
        "pos", "neg" or "total".
    group : iterable
        Checkbox-like objects exposing ``value`` (bool) and ``description``
        ("People" or "Events"); every ticked one contributes its table.
    domain : str
        "Articles" or "Talks".
    geo : sequence
        ``[continent, country]``; either may be "all".
    times : sequence of int
        ``[start_year, start_month, end_year, end_month]``.
    ra : bool
        When true, add a centred rolling mean column "ra" to each result.
    window : int
        Rolling window size, used only when ``ra`` is true.
    unit : str
        "month" or "year".
    min_length : int
        Rows with ``length`` below this value are dropped.

    Returns
    -------
    tuple
        ``(ok, median_df, p25_df, p75_df, score_column, x_values,
        size_df, statistics, start_date, end_date)``.  When nothing can be
        plotted ``ok`` is False and every element except ``statistics`` (a
        list of user-facing messages) is False.
    """
    statistics = []
    # Corpus counts quoted in the user-facing messages.
    dict_a_people = {"total":"1,146,257","date":"775,664","BC":"27"}
    dict_a_events = {"total":"54,071","date":"22,582","BC":"33"}
    dict_t_people = {"total":"415,124","date":"289,108","BC":"26"}
    dict_t_events = {"total":"21,621","date":"10,283","BC":"27"}
    statistics_dict = {
        "Articles": {"People": dict_a_people, "Events": dict_a_events},
        "Talks": {"People": dict_t_people, "Events": dict_t_events},
    }
    birth_occurrance = {"People":"birth","Events":"occurrance"}

    def _no_result():
        # Shape shared by every early exit: only the messages are meaningful.
        return False,False,False,False,False,False,False,statistics,False,False

    start_yy, start_mm, end_yy, end_mm = times
    start_date = date(start_yy, start_mm, 1)
    end_date = date(end_yy, end_mm, 1)

    # Collect the table of every ticked group for the chosen domain.
    columns = ['pos_score_'+lexicon,'neg_score_'+lexicon,'date','country','continent','length']
    frames = []
    for one in group:
        if not one.value:
            continue
        group_stats = statistics_dict[domain][one.description]
        statistics.append("There are totally %s %s entities in Wikipedia %s."%(group_stats['total'],one.description,domain))
        statistics.append("Among them, %s have %s date information. Inside them we exclude %s entities whose date before Christ (BC)."%
                          (group_stats['date'],birth_occurrance[one.description],group_stats['BC']))
        frames.append(complete_df[domain][one.description][columns])
    if not frames:
        df = pd.DataFrame()
    elif len(frames) == 1:
        df = frames[0]
    else:
        df = pd.concat(frames)
    if df.empty:
        statistics.append("Please select value for Group.")
        return _no_result()
    statistics.append("After all, there are {:,d} entities with AC date information in this run.".format(len(df)))

    # Filter on document length.
    df = df[df['length']>=min_length]
    statistics.append("After filtering with the length of the document, there are {:,d} entities left.".format(len(df)))

    # Filter on geography; the continent "all" disables the filter.
    continent, country = geo
    if continent != 'all':
        if country == 'all':
            df = df[df['continent']==continent]
        else:
            # The dropdown name differs from the one compared against here.
            if country == 'United States of America':
                country = 'United States'
            df = df[df['country']==country]
    statistics.append("After filtering with area, there are {:,d} entities left.".format(len(df)))
    if len(df)==0:
        statistics.append("No entities fit the requirements, please change the settings.")
        return _no_result()

    # Add the period column and its datetime counterpart.
    if unit == "month":
        df = add_month_column(df)
    elif unit == "year":
        df = add_year_column(df)
    df = add_datetime_column(df,unit)
    time_column = "datetime_"+unit

    # One x-axis entry per month from start to end (inclusive); also used as
    # the time-range filter.
    xaxis_value = list(rrule.rrule(rrule.MONTHLY,dtstart=start_date,until=end_date))
    df = df[df[time_column].isin(xaxis_value)]
    statistics.append("After filtering with date, there are {:,d} entities left, and collected in the plot.".format(len(df)))
    if len(df)==0:
        statistics.append("No entities fit the requirements, please change the settings.")
        return _no_result()

    # Pick the score column for the requested sentiment.
    df = df.assign(total=df["pos_score_"+lexicon]+df["neg_score_"+lexicon])
    if sentiment == "total":
        score_column = "total"
    else:
        score_column = sentiment+"_score_"+lexicon

    df_median = score_median_group_by_column(df,unit,score_column)
    df_median = add_datetime_column(df_median,unit)
    df_25percentile = score_percentile_group_by_column(df,time_column,score_column,0.25, lexicon)
    df_75percentile = score_percentile_group_by_column(df,time_column,score_column,0.75, lexicon)

    # Align every result on the full x-axis; periods without data score 0.
    fill_temp = pd.DataFrame(xaxis_value,columns=[time_column])
    fill_df_median = fill_temp.merge(df_median,how='left')
    # Merge on the unit-specific column: the former hard-coded
    # 'datetime_month' key failed for unit == "year".
    fill_df_25percentile = fill_temp.merge(df_25percentile,how='left',on=time_column)
    fill_df_75percentile = fill_temp.merge(df_75percentile,how='left',on=time_column)
    filled_frames = (fill_df_median, fill_df_25percentile, fill_df_75percentile)
    for frame in filled_frames:
        frame.loc[frame[score_column].isnull(),score_column]=0

    if ra:
        for frame in filled_frames:
            frame["ra"] = frame[score_column].rolling(window,center=True).mean()

    temp_x = fill_df_median[time_column].values

    # Number of entities per period, 0 where a period has none.
    df_time_size = df.groupby(time_column).size().reset_index(name='size')
    fill_df_time_size = fill_temp.merge(df_time_size,how='left').fillna(0)
    return True, fill_df_median,fill_df_25percentile,fill_df_75percentile,score_column,temp_x,fill_df_time_size,statistics,start_date,end_date
# # set ax
# myfontsize = 10
# ax.set_title("median,percentile of "+sentiment+" score for "+group+" based on "+lexicon,fontsize=myfontsize)
# ax.set_xlabel('time')
# ax.set_ylabel('score')
# ax.set_xlim([start_date,end_date])
# [[item.set_color('b') for item in bp_dict[key]['boxes']] for key in bp.keys()]
# [[item.set_color('b') for item in bp_dict[key]['whiskers']] for key in bp.keys()]
# [[item.set_color('r') for item in bp_dict[key]['medians']] for key in bp.keys()]
# for tick in bp_axes.get_xticklabels():
# tick.set_rotation(90)
#plt.show()
#prepare_flowplot_percentiles("OL","total","People",domain,geo,times,ra,window,unit)
#flowplot_percentiles("OL","total","People",["Europe","all"],"Articles","month",[1940,1,1],[1960,1,1])
# widget 1: median of sentiment score for Wikipedia concepts over time grouped by month
# define the action to run when the update button is clicked
def on_button_clicked(b):
    """Read the widget settings, compute the statistics and redraw the output.

    Parameters
    ----------
    b : object
        The clicked button, as passed by ``Button.on_click``; unused.

    The figure is shown in ``box_out`` and the run statistics in ``text_out``.
    """
    # Collect the current settings from the input widgets.
    geo = [dropdown_continent.value, dropdown_country.value]
    times = [
        dropdown_start_year.value,
        dropdown_start_month.value,
        dropdown_end_year.value,
        dropdown_end_month.value,
    ]
    lexicon = radio_button_lexicon.value
    sentiment = sen_dict[radio_button_sentiment.value]
    domain = radio_button_domain.value
    group = cb_container.children
    ra = checkbox_ra.value
    window = dropdown_ra.value
    min_length = dropdown_length.value
    # This widget always groups by month.
    unit = "month"

    (exist, df50, df25, df75, score_column, percentile_x, df_time_size,
     output_label_list, s_date, e_date) = prepare_flowplot_percentiles(
        lexicon, sentiment, group, domain, geo, times, ra, window, unit, min_length)

    fig = None
    if exist:
        # With rolling average enabled, plot the smoothed column instead.
        if ra:
            score_column = 'ra'
        time_column = 'datetime_' + unit
        fig, axes = plt.subplots(2, 1, sharex='row', figsize=(15, 8))
        ax_count, ax_score = axes.flatten()

        # Upper plot: number of entities per month, one bar per x-axis slot.
        time_size_loc = np.arange(len(df_time_size[time_column]))
        ax_count.bar(time_size_loc, df_time_size['size'], width=0.8, align='edge')
        ax_count.set_xlim(0, time_size_loc.shape[0])
        ax_count.get_xaxis().set_visible(False)

        # Lower plot: median line with the 25%-75% band.
        df50.plot(x=time_column, y=score_column, color='r', ax=ax_score)
        ax_score.fill_between(percentile_x, df25[score_column], df75[score_column], color='b', alpha=0.2)
        ax_score.get_legend().remove()

        plot_title_dict = {"ra":"rolling average of median score"}
        ax_count.set_title("Number of entities from %s to %s matching the current setting (grouped by month)"%(s_date.strftime("%B %Y"),e_date.strftime("%B %Y")))
        ax_count.set_ylabel("number of entities")
        ax_score.set_title("Corresponding %s for each month"%(plot_title_dict.get(score_column,"median score")))
        ax_score.set_ylabel("score")
        ax_score.set_xlabel("time")

    # One HTML line per statistics message.
    output_label_str_container = widgets.VBox(
        children=[widgets.HTML(value=i) for i in output_label_list],
        layout=Layout(width='100%',border='solid 1px'),
    )

    with box_out:
        if fig is None:
            # Nothing to plot: clear the previous figure right away instead
            # of displaying the literal "None".
            clear_output()
        else:
            clear_output(wait=True)
            display(fig)
    with text_out:
        clear_output(wait=True)
        display(output_label_str_container)
    if fig is not None:
        # Release the figure once it has been rendered.
        plt.close(fig)
# framework
# lexicon container: Which lexicon you want to choose? (OL, MPQA, LIWC)
# title html
# radio_button
# group container: Which group you want to show? (People, Events)
# title html
# checkboxes container:
# checkbox
# checkbox
# area filter: Do you want to filter out the continent or country for entities?
# title html
# dropdown_continent
# dropdown_country
# domain container: What domain you want to use? (Article, Talks)
# title html
# checkboxes container:
# radio_button
# time filter: Set the start and stop of time range
# dropdown
#
# change label to HTML (for set style more flexible)
# Shared layout settings.
container_width = 'auto'
with_border_layout = Layout(border='solid 0.5px')

# Header panel: title, introduction, usage notes and description of results.
header_text = """<h1>WikiSentiFlow</h1><br>
<p>Introduction
<ul>
<li>This widget is used to show the changes of sentiment scores for Wikipedia entities (concepts) with time. It shows 25%, 50% (median), 75% quantile of sentiment score for each month to present the sentiment distribution.</li>
<li>Wikipedia entities contains entities in Wikipedia Article and Wikipedia Talk. Talk page is an area for editors to discuss about corresponding article, which can be visited from upper left side of article page.</li>
<li>This widget includes entities both People and Events. The date for people indicates birth date, while the date for events indicates occurrance date.</li>
<li>The text of Articles and Talks is extracted from Wikipedia Dump, and time stamps are extracted from DBPedia.</li>
<li>The scores are calculated with term frequency for sentiment words based on certain lexicons (OL, MPQA, LIWC, ANEW). For ANEW we take valency into account too.</li>
</ul></p>
<p>How to use
<ol>
<li>Select value for Lexicon, Sentiment, Group, Domain, Geolocation, Time, Minimum Length.</li>
<li>Rolling average is optional. Tick it and set a window size then a rolling average of median will be showed. Otherwise the median will be showed.</li>
<li>Click "Go" to run it.</li>
</ol>
</p>
<p>Results
<ul><li>For each run there will be two plots sharing x-axis.
The first plot is the number of entities matching the setting.
The second plot shows a red line with purple shadows representing the corresponding scores.
The red line presents median (50% quantile) of scores based on your settings.
And the shadows cover 25% to 75% quantile of the scores.
If rolling average is ticked, then the shadows and red line refer to rolling average of 25%, 50%, 75% quantile.
The time is splited with month as unit.</li>
<li>In the bottom you will get the data characteristics for the current run.</li></ul></p>"""
html_header = widgets.HTML(value=header_text)
header_container = widgets.VBox(
    children=[html_header],
    layout=Layout(width='100%', border='solid 0.5px'),
)
# for lexicon
# Title shown above the lexicon choices.
html_lexicon = widgets.HTML(value="<b>Lexicon</b>")
# One radio button per available sentiment lexicon.
radio_button_lexicon = widgets.RadioButtons(
    options=['OL', 'MPQA', 'LIWC', 'ANEW'],
    disabled=False,
)
# Narrow bordered column holding the title and the radio buttons.
lexicon_container = widgets.VBox(
    children=[html_lexicon, radio_button_lexicon],
    layout=Layout(width='8%', border='solid 0.5px'),
)
# for sentiment
# Title shown above the sentiment choices.
html_sentiment = widgets.HTML(value="<b>Sentiment</b>")
# Radio buttons selecting which score is plotted.
radio_button_sentiment = widgets.RadioButtons(
    options=['Total', 'Positive', 'Negative'],
    disabled=False,
)
# Column sharing the lexicon column's layout object.
sentiment_container = widgets.VBox(
    children=[html_sentiment, radio_button_sentiment],
    layout=lexicon_container.layout,
)
# for target group
# Title shown above the group checkboxes.
html_group = widgets.HTML(value="<b>Group</b>")
# One unticked checkbox per entity group, each with its own layout object.
checkboxes = [
    widgets.Checkbox(description=label, value=False, layout=Layout(left='-80px'))
    for label in ('People', 'Events')
]
# Inner box holding only the checkboxes.
cb_container = widgets.VBox(children=checkboxes, layout=Layout())
# Column sharing the lexicon column's layout object.
group_container = widgets.VBox(
    children=[html_group, cb_container],
    layout=lexicon_container.layout,
)
# for area
# Title shown above the geolocation dropdowns.
html_area = widgets.HTML(value="<b>Geolocation</b>")
dropdown_continent = widgets.Dropdown(
    options=continent_list,
    value='all',
    description='Continent:',
    layout=Layout(width='240px', left='-10px'),
    disabled=False,
)
# The country options are filled in by the link set up below.
dropdown_country = widgets.Dropdown(
    description='Country:',
    layout=Layout(width='240px', left='-10px'),
    disabled=False,
)


def transform(case):
    """Return the country choices registered for the continent ``case``."""
    return continent_country_dict[case]


# Keep the country options in step with the selected continent.
directional_link((dropdown_continent, 'value'), (dropdown_country, 'options'), transform)
area_container = widgets.VBox(
    children=[html_area, dropdown_continent, dropdown_country],
    layout=Layout(width='24%', border='solid 0.5px'),
)
# for domain
# Title shown above the domain choices.
html_domain = widgets.HTML(value="<b>Domain</b>")
# Radio buttons choosing between article pages and talk pages.
radio_button_domain = widgets.RadioButtons(
    options=['Articles', 'Talks'],
    disabled=False,
)
# Column sharing the lexicon column's layout object.
domain_container = widgets.VBox(
    children=[html_domain, radio_button_domain],
    layout=lexicon_container.layout,
)
# for date
# Title shown above the time-range dropdowns.
html_time = widgets.HTML(value="<b>Time</b>")
# Selectable years (1500..2018) and months (1..12).
year_list = list(range(1500, 2019))
month_list = list(range(1, 13))
time_style = {'description_width': '63%'}

# Start of the range; the year defaults to the 100th entry from the end.
dropdown_start_year = widgets.Dropdown(
    options=year_list,
    value=year_list[-100],
    description='Start (year, month):',
    style=time_style,
    layout=Layout(width='190px'),
    disabled=False,
)
dropdown_start_month = widgets.Dropdown(
    options=month_list,
    layout=Layout(width='50px'),
    disabled=False,
)
time_container_start = widgets.HBox(children=[dropdown_start_year, dropdown_start_month])

# End of the range; the year defaults to the last entry.
dropdown_end_year = widgets.Dropdown(
    options=year_list,
    description='End (year, month):',
    style=time_style,
    value=year_list[-1],
    layout=Layout(width='190px'),
    disabled=False,
)
dropdown_end_month = widgets.Dropdown(
    options=month_list,
    layout=Layout(width='50px'),
    disabled=False,
)
time_container_end = widgets.HBox(children=[dropdown_end_year, dropdown_end_month])

time_container = widgets.VBox(
    children=[html_time, time_container_start, time_container_end],
    layout=Layout(width='26%', border='solid 0.5px'),
)
# for rolling average
# Checkbox switching the rolling average on or off.
checkbox_ra = widgets.Checkbox(layout=Layout(left='-13px'), value=False, description='Rolling Average')
checkbox_ra_container = widgets.HBox(children=[checkbox_ra])

# Window-size dropdown (3..49); disabled until the checkbox is ticked.
ra_list = list(range(3, 50))
ra_style = {'description_width': '60%'}
dropdown_ra = widgets.Dropdown(
    options=ra_list,
    description='Windows Size:',
    style=ra_style,
    layout=Layout(width='170px', left='-6px'),
    disabled=True,
)
dropdown_ra_container = widgets.HBox(children=[dropdown_ra])


def transform_ra(case):
    """Map the checkbox value to the dropdown's ``disabled`` flag (inverted)."""
    inverted = {True: False, False: True}
    return inverted[case]


# Enable the window-size dropdown only while the checkbox is ticked.
directional_link((checkbox_ra, 'value'), (dropdown_ra, 'disabled'), transform_ra)
ra_container = widgets.VBox(
    children=[checkbox_ra_container, dropdown_ra_container],
    layout=Layout(align_items='center', width='100%'),
)
# for min length
# Minimum document length choices: 0, 5, ..., 45.
length_list = list(range(0, 50, 5))
length_style = {'description_width': '70%'}
dropdown_length = widgets.Dropdown(
    options=length_list,
    description='Minimum Length:',
    style=length_style,
    layout=Layout(width='176px', left='-9px'),
    disabled=False,
)

# "Go" button that triggers a new run.
button = widgets.Button(description="Go", button_style='primary', layout=Layout(width='90%'))

# Right-most column: rolling average, minimum length and the button.
update_container = widgets.VBox(
    children=[ra_container, dropdown_length, button],
    layout=Layout(align_items='center', width='16%'),
)
# preparing a container for input panel
# Input panel: every settings column side by side in one bordered row.
input_container = widgets.HBox(
    children=[
        lexicon_container,
        sentiment_container,
        group_container,
        domain_container,
        area_container,
        time_container,
        update_container,
    ],
    layout=Layout(
        display='flex',
        flex_flow='row',
        align_items='stretch',
        border='solid 0.5px',
    ),
)
# for plot
# Output area for the figure.
box_out = ipywidgets.Output(layout=Layout(width='100%', height='500px', border='solid 0.5px'))
# Output area for the run statistics.
text_out = widgets.Output(layout=Layout(width='100%', border='solid 0.5px'))
# Output panel: figure above, statistics below.
output_container = widgets.VBox(
    children=[box_out, text_out],
    layout=Layout(border='solid 0.5px'),
)
# Whole widget: header, input panel and output panel stacked vertically.
all_container = widgets.VBox(
    children=[header_container, input_container, output_container],
    layout=with_border_layout,
)
# Radio-button label -> sentiment key expected by the plotting code.
sen_dict = {"Total": "total", "Positive": "pos", "Negative": "neg"}

# Show the widget and hook up the "Go" button.
display(all_container)
button.on_click(on_button_clicked)
VBox(children=(VBox(children=(HTML(value='<h1>WikiSentiFlow</h1><br>\n<p>Introduction\n<ul>\n<li>This widget i…
Link to WikiSentiScatter