"""Download GDELT GKG daily files and build a co-mention network of leaders.

The script runs top to bottom and:

1. downloads one ``YYYYMMDD.gkg.csv.zip`` file per day from ``URL`` into
   ``PATH`` for every date between ``start_date`` and ``end_date``;
2. prints the header and one sample row of a single GKG CSV file;
3. scans every ``*csv`` file in ``PATH`` and keeps the persons list of each
   row that mentions at least one name from ``LEADERS``;
4. counts how often each unordered pair of persons appears together;
5. builds a weighted ``networkx`` graph from pairs seen more than once,
   writes it to ``iran.graphml`` and plots the network and its centralities.

All ``print`` calls take a single argument so the syntax is valid on both
Python 2 and Python 3.
"""

import os
import datetime as dt
import time
import io
import itertools
from collections import defaultdict

import numpy as np
import requests
# ``matplotlib`` itself must be imported: importing ``matplotlib.pyplot as
# plt`` alone does not bind the name ``matplotlib`` used for ``rcParams``.
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx


def _plot_centrality_labels(eigen, between, label_below=None, limits=None):
    """Draw node names at (eigenvector, betweenness) centrality coordinates.

    Parameters:
        eigen: dict mapping node name to its eigenvector centrality (x axis).
        between: dict mapping node name to its betweenness centrality
            (y axis); must contain every key of ``eigen``.
        label_below: optional ``(x_max, y_max)`` pair. When given, a name is
            drawn only if ``x < x_max`` and ``0 < y < y_max``. When omitted,
            every name is drawn.
        limits: optional ``(x_upper, y_upper)`` pair. When given, the axes
            are limited to ``[0, x_upper]`` and ``[0, y_upper]``.

    Returns:
        The ``(figure, axes)`` pair that was created.
    """
    figure = plt.figure(figsize=(20, 12))
    axes = figure.add_subplot(111)
    for name in eigen:
        x = eigen[name]
        y = between[name]
        if label_below is not None:
            x_max, y_max = label_below
            if not (x < x_max and 0 < y < y_max):
                continue
        axes.text(x, y, name, fontdict={"size": 16})
    if limits is not None:
        axes.set_xlim(0, limits[0])
        axes.set_ylim(0, limits[1])
    axes.set_xlabel("Eigenvector Centrality", size=20)
    axes.set_ylabel("Betweenness Centrality", size=20)
    return figure, axes


URL = "http://gdelt.utdallas.edu/data/gkg/"  # The GKG data directory
PATH = "/Users/dmasad/Data/GDELT/GKG/"  # The local directory to store the data

# Specify the start and end date for your data
start_date = dt.datetime(2013, 10, 5)
end_date = dt.datetime.today()

# For each date in between, download the corresponding file
date = start_date
while date <= end_date:
    filename = date.strftime("%Y%m%d") + ".gkg.csv.zip"
    # stream=True lets iter_content() write the body chunk by chunk instead
    # of holding the whole file in memory first.
    req = requests.get(URL + filename, stream=True)
    try:
        if req.status_code == 200:
            with io.open(PATH + filename, "wb") as dl:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        dl.write(chunk)
        else:
            # Do not save an HTTP error page under a .zip name; a missing
            # day (e.g. a file not published yet) is simply skipped.
            print("Skipping %s (HTTP %d)" % (filename, req.status_code))
    finally:
        req.close()
    time.sleep(30)  # Be nice and don't overload the server.
    date += dt.timedelta(days=1)

# Look at the header and one sample row of a single file.
# NOTE(review): this file is dated before start_date and is an uncompressed
# .csv, so it is not produced by the download loop above -- presumably it was
# fetched and unzipped separately; confirm.
with open(PATH + "20131001.gkg.csv") as f:
    headers = f.readline()
    f.readline()  # Skip a row
    row = f.readline()

print(headers.split("\t"))
row = row.split("\t")
for entry in row:
    print(entry)

for entry in row[2].split(";"):
    print(entry.split("#"))

print(row[3].split(";"))

for entry in row[4].split(";"):
    print(entry.split("#"))

# PERSONS:
print(row[5].split(";"))

# ORGANIZATIONS
print(row[6].split(";"))

print(row[7].split(","))
print(row[8].split(","))

# SOURCE:
print(row[9])

# SOURCEURL
print(row[10])

LEADERS = [
    "Ali Hoseini-KHAMENEI",
    "Hasan Fereidun RUHANI",
    "Mohsen HAJI-MIRZAIE",
    "Mohammad NAHAVANDIAN",
    "Eshaq JAHANGIRI",
    "Mohammad SHARIATMADARI",
    "Elham AMINZADEH",
    "Mohammad Baqer NOBAKHT",
    "Majid ANSARI",
    "Mohammad Baqer NOBAKHT",
    "Sorena SATARI",
    "Shahindokht MOLAVERDI",
    "Ali Akbar SALEHI",
    "Mohammad Ali NAJAFI",
    "Masumeh EBTEKAR",
    "Mohammad Ali SHAHADI",
    "Mohammad HOJJATI",
    "Mahmud VAEZI-Jazai",
    "Ali JANATI",
    "Hosein DEHQAN",
    "Ali TAYEBNIA",
    "Ali Asqar FANI",
    "Hamid CHITCHIAN",
    "Mohammad Javad ZARIF-Khonsari",
    "Seyed Hasan QAZIZADEH-Hashemi",
    "Mohammad Reza NEMATZADEH",
    "Mahmud ALAVI, Hojjat ol-Eslam",
    "Abdolreza Rahmani-FAZLI",
    "Mostafa PUR-MOHAMMADI",
    "Ali RABIEI",
    "Bijan Namdar-ZANGANEH",
    "Abbas Ahmad AKHUNDI",
    "Reza FARAJI-DANA",
    "Valiollah SEIF",
    "Mohammad KHAZAI-Torshizi",
]

# Convert the names to all lower-case
LEADERS = [name.lower() for name in LEADERS]
# Build the lookup set once; membership is tested for every actor of every row.
leader_names = frozenset(LEADERS)

# Collect the persons list of every row that mentions at least one leader.
entries = []
for path in os.listdir(PATH):
    if not path.endswith("csv"):
        continue
    with open(PATH + path) as f:
        for row in f:
            fields = row.split("\t")
            if len(fields) <= 5:
                # Blank or truncated line: there is no persons column.
                continue
            actors = fields[5].split(";")
            if any(actor in leader_names for actor in actors):
                entries.append(actors)

print(len(entries))

# Count co-mentions. A pair is stored under whichever orientation was seen
# first, so (a, b) and (b, a) accumulate into the same counter.
dyads = defaultdict(int)
for entry in entries:
    for p1, p2 in itertools.combinations(entry, 2):
        key = (p2, p1) if (p2, p1) in dyads else (p1, p2)
        dyads[key] += 1

# Some initial styling, to make our graphs look good:
matplotlib.rcParams['axes.facecolor'] = "#eeeeee"
matplotlib.rcParams['axes.grid'] = True
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14

# Materialise the counts as a list: a Python 3 dict view cannot be turned
# into a numeric array by np.array() directly.
dyad_counts = list(dyads.values())

fig = plt.figure(figsize=(20, 12))
ax = fig.add_subplot(111)
ax.set_yscale('log')
h = ax.hist(dyad_counts, bins=np.linspace(1, 250, 26))

# Fraction of dyads that were observed more than once.
counts = np.array(dyad_counts)
print(len(counts[counts > 1]) / (1.0 * len(counts)))

# Build the graph
G = nx.Graph()
for (p1, p2), count in dyads.items():
    if count > 1:
        G.add_edge(p1, p2, weight=count)

nx.write_graphml(G, "iran.graphml")

fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111)
pos = nx.spring_layout(G, k=0.2, iterations=25)
nx.draw_networkx_edges(G, pos=pos, ax=ax, edge_color='#eeeeee')
nx.draw_networkx_labels(G, pos=pos, ax=ax, font_size=16)
_ = ax.axis('off')

eigen_centralities = nx.eigenvector_centrality(G)
between_centralities = nx.betweenness_centrality(G)

# All nodes, no filtering and default axis limits.
fig, ax = _plot_centrality_labels(eigen_centralities, between_centralities)

# Zoom in on the lower-left corner.
fig, ax = _plot_centrality_labels(
    eigen_centralities,
    between_centralities,
    label_below=(0.25, 0.2),
    limits=(0.2, 0.16),
)

# Zoom in further.
fig, ax = _plot_centrality_labels(
    eigen_centralities,
    between_centralities,
    label_below=(0.05, 0.02),
    limits=(0.05, 0.02),
)

# Style the code using CSS shamelessly lifted from Bayesian Methods for Hackers
# https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers
from IPython.core.display import HTML
with open("Style.css") as css:
    styles = css.read()
HTML(styles)