The following is a public-facing walkthrough of the code used to for the predictive tool that powers @WorldLeaderTips. The styling of the out graph has been changed for this context.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
import json
import random
import datetime as dt
from collections import deque
import numpy as np
import pandas as pd
from pandas import *
from pandas.io.json import json_normalize
import matplotlib.pylab
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
from statsmodels.tsa import *
from bigquery import get_client
tomorrow = str(dt.date.today())
twoweeksago = str(dt.date.today() - dt.timedelta(days = 14))
lastweek = dt.datetime.today() - dt.timedelta(days = 7)
nextweek = dt.date.today() + dt.timedelta(days = 8)
day_index = dt.datetime.today().weekday()
# Initialize Google BigQuery client:
with open('/PATH/TO/settings.json', 'rb') as json_data:
settings = json.load(json_data)
client = get_client(settings['PROJECT_ID'], service_account=settings['SERVICE_ACCOUNT'], private_key=settings['KEY'], readonly=True)
# Load the Goldstein suggstions table:
gsCodes = pd.read_csv('assets/goldstein_suggestions.tsv', index_col='code', sep='\t')
leader_display_name = 'Pope Francis'
gdelt_search = "WHERE Actor1Name CONTAINS 'POPE' AND SQLDATE > 20130313"
date_start = '19790101'
date_end = ''.join(str(dt.date.today()).split('-'))
query_template = 'SELECT GLOBALEVENTID, SQLDATE, Actor1Name, Actor1CountryCode, GoldsteinScale, NumMentions FROM [gdelt-bq:full.events] {} AND SQLDATE>{} AND SQLDATE<{} IGNORE CASE;'
query = query_template.format(gdelt_search, date_start, date_end)
print 'Search for: {}'.format(leader_display_name)
print 'Query: {}'.format(query)
Search for: Pope Francis Query: SELECT GLOBALEVENTID, SQLDATE, Actor1Name, Actor1CountryCode, GoldsteinScale, NumMentions FROM [gdelt-bq:full.events] WHERE Actor1Name CONTAINS 'POPE' AND SQLDATE > 20130313 AND SQLDATE>19790101 AND SQLDATE<20150808 IGNORE CASE;
filename = '{}.csv'.format(leader_display_name.replace(' ', '_'))
try:
job_id, results = client.query(query, timeout=2000)
print '{} results'.format(str(len(results)))
if len(results) == 0:
print "NO RESULTS"
df = json_normalize(results)
df = df.sort(columns='SQLDATE')
df.to_csv('{}'.format(filename), encoding='utf-8')
print 'Saved: {}'.format(filename)
except:
print 'ERROR: Timed out'
56788 results Saved: Pope_Francis.csv
df = pd.read_csv(filename, index_col='SQLDATE', parse_dates=True)
df.tail()
Unnamed: 0 | Actor1CountryCode | Actor1Name | GLOBALEVENTID | GoldsteinScale | NumMentions | |
---|---|---|---|---|---|---|
SQLDATE | ||||||
2015-08-07 | 39523 | NaN | THE POPE | 455865092 | 1.0 | 1 |
2015-08-07 | 39522 | NaN | THE POPE | 455865091 | 1.0 | 1 |
2015-08-07 | 39521 | NaN | THE POPE | 455865090 | 1.0 | 7 |
2015-08-07 | 39535 | NaN | THE POPE | 455885123 | 1.9 | 10 |
2015-08-07 | 39619 | NaN | THE POPE | 456037032 | 0.0 | 10 |
# These steps incorporate the number of mentions a Goldstein score is associated with, reducing the impact of error in the event encoding, making the average better reflect the event's presence in the GDELT.
df['GoldMentions'] = df['GoldsteinScale'] * df['NumMentions']
goldstein = df.groupby([df.index.date]).agg({'GoldMentions': np.sum, 'NumMentions': np.sum})
goldstein['GoldAverage'] = goldstein['GoldMentions'] / goldstein['NumMentions']
full_daterange = pd.date_range(start=min(df.index), end=max(df.index))
# interpolate() takes care of days that do not have a entry / Goldstein score in GDELT:
goldstein = goldstein.reindex(full_daterange).interpolate(method='linear')
# In case there is not enough historical data for the world leader:
if len(goldstein['GoldAverage']) < 230:
day_difference = 230 - len(goldstein['GoldAverage']) + 1
new_dates = [goldstein.index[0] - dt.timedelta(days=x) for x in range(1, day_difference)]
new_dates = reversed(new_dates)
columns = ['NumMentions', 'GoldMentions', 'GoldAverage']
temp_data = {col: [None for x in range(1, day_difference)] for col in columns}
missing = pd.DataFrame(temp_data, index=new_dates, columns=columns)
temp_goldstein = missing.append(goldstein)
goldstein = temp_goldstein.fillna(method='bfill')
print 'Insufficient data, missing entries were backfilled.'
else:
print 'Sufficient data, backfilling was avoided.'
Sufficient data, backfilling was avoided.
#Create two instances of the data, one to train the model...
training = pd.DataFrame(goldstein['GoldAverage'])
training.index = pd.to_datetime(training.index)
training.columns = ['Goldstein daily average']
print 'Test Sample {}\n'.format(training.tail())
# ... the other to create the plot:
plot_sample = pd.DataFrame(goldstein['GoldAverage'][-230:])
plot_sample.index = pd.to_datetime(plot_sample.index)
plot_sample.columns = ['Goldstein daily average']
print 'Plot Sample {}'.format(plot_sample.tail())
Test Sample Goldstein daily average 2015-08-03 1.599440 2015-08-04 2.368912 2015-08-05 -1.088742 2015-08-06 1.525023 2015-08-07 2.479502 Plot Sample Goldstein daily average 2015-08-03 1.599440 2015-08-04 2.368912 2015-08-05 -1.088742 2015-08-06 1.525023 2015-08-07 2.479502
# Creates the forcasting model using Autoregressive Moving Average (ARMA):
tries = 0
success = False
while tries < 6 and success is False:
try:
model = sm.tsa.ARMA(training,(12, tries)).fit()
success = True
except:
tries += 1
if success is False:
print "NOT SUCCESSFUL"
print model
<statsmodels.tsa.arima_model.ARMAResultsWrapper object at 0x110ef2cd0>
# Creates the prediction based on the proceeding two weeks through one week into the future:
prediction = model.predict(twoweeksago, str(nextweek), dynamic = False)
# The fitting of the prediction to the model:
prediction = prediction.shift(-1)
def stir_the_pot(sample_range, st_dev):
mean = sample_range.mean()
added_range = mean + st_dev
random_suggestion = None
# Randomly suggest something either above or below the stagnant Goldstein range:
if random.randint(0,1) == 1:
random_suggestion = "{0:.1f}".format(random.uniform(float(added_range),10))
else:
random_suggestion = "{0:.1f}".format(random.uniform(-10,-float(added_range)))
print 'new suggestion: {}'.format(random_suggestion)
return float(random_suggestion)
# Based on the Prediction, determine the Suggestion:
sample_window = training[-60:]
standard_deviation = float(sample_window.std())
suggestion = None
if standard_deviation < 2.1 and random.randint(1,5) > 2:
# The leader has been stuck in a rut and is not honing in on Goldstein Score of '1', 3/5 chance of getting a new suggestion:
print 'Your actions are too stagnant!'
suggestion = stir_the_pot(sample_window, standard_deviation)
predicts = round(prediction.ix[tomorrow:str(nextweek)].mean(), 1)
else:
# Finds the average of the Goldstein scores for the coming week:
predicts = round(prediction.ix[tomorrow:str(nextweek)].mean(), 1)
suggestion = round(((predicts - 1) * -1), 1)
print 'suggestion: {}'.format(suggestion)
Your actions are too stagnant! new suggestion: -5.4 suggestion: -5.4
# From the possible suggestions for the score, find one at random:
prediction_text = random.choice(gsCodes.loc[predicts].values[0].split(';')).strip()
gsDescription = random.choice(gsCodes.loc[suggestion].values[0].split(';')).strip()
print '=================================================='
print "{}'s Forecast: {} {} ".format(leader_display_name, predicts, prediction_text)
print "{}'s Suggested score: {} ".format(leader_display_name, suggestion)
if predicts == 1.0:
gsDescription = "You're doing great. Stay the course."
else:
print "Suggested actions for the coming week:\n{}".format(gsDescription)
print '=================================================='
================================================== Pope Francis's Forecast: 1.7 Make an apology Pope Francis's Suggested score: -5.4 Suggested actions for the coming week: Perform a non-military demonstration or walk out on something ==================================================
plot_sample.plot(kind='line',
title='{}: Goldstein Trend and Prediction'.format(leader_display_name),
ylim=(-10,10),
figsize=(16,8),
color='lightblue')
prediction.plot(kind='line',
label='prediction',
legend=True,
color='red')
<matplotlib.axes.AxesSubplot at 0x11151a610>