#!/usr/bin/env python
# coding: utf-8

# # International Political Tele-Conferencing Equilibrium System (IPTCES) algorithm
#
# The following is a public-facing walkthrough of the code used for the
# predictive tool that powers
# [@WorldLeaderTips](https://twitter.com/worldleadertips).
# The styling of the output graph has been changed for this context.
#
# NOTE: output uses print(...) with a single argument, which behaves the same
# as the Python 2 print statement this notebook was written with.

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

# In[2]:

import json
import random
import datetime as dt
from collections import deque
import numpy as np
import pandas as pd
from pandas import *
from pandas.io.json import json_normalize
import matplotlib.pylab
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
from statsmodels.tsa import *
from bigquery import get_client

# In[3]:

# Date anchors used for the query window and for slicing the prediction.
# NOTE(review): `tomorrow` holds *today's* date despite its name -- presumably
# deliberate because the prediction series is shifted back by one day further
# down (`prediction.shift(-1)`); confirm before renaming.
tomorrow = str(dt.date.today())
twoweeksago = str(dt.date.today() - dt.timedelta(days=14))
lastweek = dt.datetime.today() - dt.timedelta(days=7)
nextweek = dt.date.today() + dt.timedelta(days=8)
day_index = dt.datetime.today().weekday()

# In[4]:

# Initialize Google BigQuery client:
with open('/PATH/TO/settings.json', 'rb') as json_data:
    settings = json.load(json_data)

client = get_client(settings['PROJECT_ID'],
                    service_account=settings['SERVICE_ACCOUNT'],
                    private_key=settings['KEY'],
                    readonly=True)

# In[5]:

# Load the Goldstein suggestions table (indexed by Goldstein score `code`):
gsCodes = pd.read_csv('assets/goldstein_suggestions.tsv', index_col='code', sep='\t')

# In[6]:

leader_display_name = 'Pope Francis'
gdelt_search = "WHERE Actor1Name CONTAINS 'POPE' AND SQLDATE > 20130313"
date_start = '19790101'
# Today's date as YYYYMMDD, the format of the SQLDATE column in the query.
date_end = dt.date.today().strftime('%Y%m%d')
query_template = 'SELECT GLOBALEVENTID, SQLDATE, Actor1Name, Actor1CountryCode, GoldsteinScale, NumMentions FROM [gdelt-bq:full.events] {} AND SQLDATE>{} AND SQLDATE<{} IGNORE CASE;'
query = query_template.format(gdelt_search, date_start, date_end)
print('Search for: {}'.format(leader_display_name))
print('Query: {}'.format(query))

# In[7]:

filename = '{}.csv'.format(leader_display_name.replace(' ', '_'))

# In[8]:

# Fetch fresh events and cache them as CSV. This step is best-effort: the next
# cell reads `filename` from disk, so a failure here falls back to whatever
# was cached by an earlier run.
try:
    job_id, results = client.query(query, timeout=2000)
    print('{} results'.format(len(results)))
    if results:
        df = json_normalize(results)
        df = df.sort(columns='SQLDATE')
        df.to_csv(filename, encoding='utf-8')
        print('Saved: {}'.format(filename))
    else:
        # Nothing to normalise or sort; keep any previously cached file
        # instead of failing on the missing SQLDATE column.
        print("NO RESULTS")
except Exception as query_error:
    # Report the real cause instead of always claiming a timeout, and let
    # KeyboardInterrupt / SystemExit propagate.
    print('ERROR: query failed or timed out: {!r}'.format(query_error))

# In[9]:

df = pd.read_csv(filename, index_col='SQLDATE', parse_dates=True)
df.tail()

# In[10]:

# These steps incorporate the number of mentions a Goldstein score is
# associated with, reducing the impact of error in the event encoding and
# making the average better reflect the event's presence in GDELT.
# GoldAverage is the mention-weighted mean Goldstein score per day.
df['GoldMentions'] = df['GoldsteinScale'] * df['NumMentions']
goldstein = df.groupby([df.index.date]).agg({'GoldMentions': np.sum, 'NumMentions': np.sum})
goldstein['GoldAverage'] = goldstein['GoldMentions'] / goldstein['NumMentions']

# In[11]:

full_daterange = pd.date_range(start=min(df.index), end=max(df.index))

# In[12]:

# interpolate() takes care of days that do not have an entry / Goldstein
# score in GDELT:
goldstein = goldstein.reindex(full_daterange).interpolate(method='linear')

# In[13]:

# In case there is not enough historical data for the world leader, pad the
# front of the series so that it is 230 days long:
if len(goldstein['GoldAverage']) < 230:
    # day_difference is one more than the number of days to add.
    day_difference = 230 - len(goldstein['GoldAverage']) + 1
    # Build the missing dates oldest-first, ending the day before the first
    # real observation.
    new_dates = [goldstein.index[0] - dt.timedelta(days=x)
                 for x in range(day_difference - 1, 0, -1)]
    columns = ['NumMentions', 'GoldMentions', 'GoldAverage']
    temp_data = {col: [None] * (day_difference - 1) for col in columns}
    missing = pd.DataFrame(temp_data, index=new_dates, columns=columns)
    temp_goldstein = missing.append(goldstein)
    # Back-fill: every padded day takes the values of the first real day.
    goldstein = temp_goldstein.fillna(method='bfill')
    print('Insufficient data, missing entries were backfilled.')
else:
    print('Sufficient data, backfilling was avoided.')

# In[14]:

# Create two instances of the data, one to train the model...
training = pd.DataFrame(goldstein['GoldAverage'])
training.index = pd.to_datetime(training.index)
training.columns = ['Goldstein daily average']
print('Test Sample {}\n'.format(training.tail()))

# ... the other to create the plot (only the most recent 230 days):
plot_sample = pd.DataFrame(goldstein['GoldAverage'][-230:])
plot_sample.index = pd.to_datetime(plot_sample.index)
plot_sample.columns = ['Goldstein daily average']
print('Plot Sample {}'.format(plot_sample.tail()))

# In[15]:

# Creates the forecasting model using Autoregressive Moving Average (ARMA).
# The AR order is fixed at 12; the MA order starts at 0 and is increased by
# one after every failed fit, up to 5.
tries = 0
success = False
model = None
while tries < 6 and not success:
    try:
        model = sm.tsa.ARMA(training, (12, tries)).fit()
        success = True
    except Exception as fit_error:
        # Show why this order failed before trying the next one.
        print('ARMA(12, {}) could not be fitted: {}'.format(tries, fit_error))
        tries += 1

if not success:
    print("NOT SUCCESSFUL")
    # Stop here: everything below needs a fitted model, and silently
    # continuing would use an undefined (or stale) `model`.
    raise RuntimeError('No ARMA(12, q) model could be fitted for q in 0..5')
print(model)

# In[16]:

# Creates the prediction based on the preceding two weeks through one week
# into the future:
prediction = model.predict(twoweeksago, str(nextweek), dynamic=False)

# The fitting of the prediction to the model (moves every value one day
# earlier in the index):
prediction = prediction.shift(-1)

# In[17]:


def stir_the_pot(sample_range, st_dev):
    """Return a random Goldstein score outside the recent, stagnant range.

    The stagnant range is taken as ``mean(sample_range) +/- st_dev``. With
    equal probability the suggestion is drawn uniformly from above that range
    (up to 10) or from below it (down to -10), rounded to one decimal.

    Args:
        sample_range: Recent scores; anything whose ``.mean()`` can be
            converted with ``float()`` (here a single-column DataFrame).
        st_dev: Standard deviation of ``sample_range`` as a float.

    Returns:
        The suggested score as a float with one decimal place.
    """
    mean = float(sample_range.mean())
    # Edges of the stagnant band, kept inside the -10..10 range used for the
    # draws below.
    upper_edge = min(mean + st_dev, 10.0)
    # The lower edge is mean - st_dev; negating the upper edge (as before)
    # is not "below the range" when the mean is negative.
    lower_edge = max(mean - st_dev, -10.0)

    # Randomly suggest something either above or below the stagnant
    # Goldstein range:
    if random.randint(0, 1) == 1:
        drawn = random.uniform(upper_edge, 10)
    else:
        drawn = random.uniform(-10, lower_edge)

    random_suggestion = "{0:.1f}".format(drawn)
    print('new suggestion: {}'.format(random_suggestion))
    return float(random_suggestion)


# In[18]:

# Based on the Prediction, determine the Suggestion:
sample_window = training[-60:]
standard_deviation = float(sample_window.std())

# Finds the average of the Goldstein scores for the coming week (needed by
# both branches below, so it is computed once):
predicts = round(prediction.ix[tomorrow:str(nextweek)].mean(), 1)

suggestion = None
if standard_deviation < 2.1 and random.randint(1, 5) > 2:
    # The leader has been stuck in a rut and is not honing in on a Goldstein
    # Score of '1'; 3/5 chance of getting a new suggestion:
    print('Your actions are too stagnant!')
    suggestion = stir_the_pot(sample_window, standard_deviation)
else:
    # Mirror the forecast around the target score of 1.
    suggestion = round(1 - predicts, 1)
print('suggestion: {}'.format(suggestion))

# In[19]:

# From the possible suggestions for the score, find one at random. Each table
# cell holds several ';'-separated texts.
prediction_text = random.choice(gsCodes.loc[predicts].values[0].split(';')).strip()
gsDescription = random.choice(gsCodes.loc[suggestion].values[0].split(';')).strip()

# In[20]:

print('==================================================')
print("{}'s Forecast: {} {} ".format(leader_display_name, predicts, prediction_text))
print("{}'s Suggested score: {} ".format(leader_display_name, suggestion))
if predicts == 1.0:
    gsDescription = "You're doing great. Stay the course."
    # Previously this message was assigned but never shown.
    print(gsDescription)
else:
    print("Suggested actions for the coming week:\n{}".format(gsDescription))
print('==================================================')

# In[21]:

plot_sample.plot(kind='line',
                 title='{}: Goldstein Trend and Prediction'.format(leader_display_name),
                 ylim=(-10, 10),
                 figsize=(16, 8),
                 color='lightblue')
prediction.plot(kind='line', label='prediction', legend=True, color='red')

# In[ ]: