%matplotlib inline

import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import os
import numpy as np

if ( sys.version_info.major == 3 ):
    from functools import reduce

tweetPath = os.path.join("data_files", "twitter")
tweetFiles = {
   "time01": os.path.join(tweetPath, "statuses.*.gz")
}

frequencyMap = {}
globalTweetCounter = 0

timeFormat = "%a %b %d %H:%M:%S +0000 %Y"

reader = codecs.getreader("utf-8")

for (key, path) in tweetFiles.items():
    localTweetList = []
    for filePath in glob.glob(path):
        print ("Reading File:", filePath)
        
        for line in gzip.open(filePath, 'rb'):

            # Try to read tweet JSON into object
            tweetObj = None
            try:
                tweetObj = json.loads(reader.decode(line)[0])
            except Exception as e:
                continue

            # Deleted status messages and protected status must be skipped
            if ( "delete" in tweetObj.keys() or "status_withheld" in tweetObj.keys() ):
                continue

            # Try to extract the time of the tweet
            try:
                currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat)
            except:
                print (line)
                raise

            currentTime = currentTime.replace(second=0)
            
            # Increment tweet count
            globalTweetCounter += 1
            
            # If our frequency map already has this time, use it, otherwise add
            if ( currentTime in frequencyMap.keys() ):
                timeMap = frequencyMap[currentTime]
                timeMap["count"] += 1
                timeMap["list"].append(tweetObj)
            else:
                frequencyMap[currentTime] = {"count":1, "list":[tweetObj]}

# Fill in any gaps
times = sorted(frequencyMap.keys())
firstTime = times[0]
lastTime = times[-1]
thisTime = firstTime

timeIntervalStep = datetime.timedelta(0, 60)    # Time step in seconds
while ( thisTime <= lastTime ):
    if ( thisTime not in frequencyMap.keys() ):
        frequencyMap[thisTime] = {"count":0, "list":[]}
        
    thisTime = thisTime + timeIntervalStep

print ("Processed Tweet Count:", globalTweetCounter)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Tweet Frequency")

# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())

# What time span do these tweets cover?
print ("Time Frame:", sortedTimes[0], sortedTimes[-1])

# Get a count of tweets per minute
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

# We'll have ticks every thirty minutes (much more clutters the graph)
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

# Plot the post frequency
ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax.grid(b=True, which=u'major')
ax.legend()

plt.show()

# Create maps for holding counts and tweets for each user
globalUserCounter = {}
globalUserMap = {}

# Iterate through the time stamps
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    # For each tweet, pull the screen name and add it to the list
    for tweet in timeObj["list"]:
        user = tweet["user"]["screen_name"]
        
        if ( user not in globalUserCounter ):
            globalUserCounter[user] = 1
            globalUserMap[user] = [tweet]
        else:
            globalUserCounter[user] += 1
            globalUserMap[user].append(tweet)

print ("Unique Users:", len(globalUserCounter.keys()))

sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u], "\n\t", "Random Tweet:", globalUserMap[u][0]["text"], "\n----------")

import tweepy

consumer_key = "RfWoIb9wocCY0kOYKUYnf5VOo"
consumer_secret = "FqsdZGdD4yvzwPj0yoe7lHRxgG4tjz2WVZbozxpOPnDunMhzv9"
access_token = "2421639553-0IF33x71RsEJL2aKCksu0C1VR8383nqRQK0dYSE"
access_token_secret = "3wSJCvLhgPBi8NUNVWbvosK2DAraGgB9K0NN0URNLVWjs"

# Set up the authorization mechanisms for Tweepy to access Twitter's API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u])

    # Get user info
    try:
        user = api.get_user(u)
        print ("\tDescription:", user.description)
    except Exception as te:
        print ("\tDescription Error:", te)
        
    print ("----------")

plt.figure(figsize=(16,8))
    
# the histogram of the data
plt.hist(
    [globalUserCounter[x] for x in globalUserCounter], 
    bins=100, 
    normed=0, 
    alpha=0.75,
    label="Counts",
    log=True)

plt.xlabel('Number of Tweets')
plt.ylabel('Counts')
plt.title("Histogram of Frequency")
plt.grid(True)
plt.legend()

plt.show()

avgPostCount = np.mean([globalUserCounter[x] for x in globalUserCounter])
print("Average Number of Posts:", avgPostCount)

# A map for hashtag counts
hashtagCounter = {}

# For each minute, pull the list of hashtags and add to the counter
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        hashtagList = tweet["entities"]["hashtags"]
        
        for hashtagObj in hashtagList:
            
            # We lowercase the hashtag to avoid duplicates (e.g., #MikeBrown vs. #mikebrown)
            hashtagString = hashtagObj["text"].lower()
            
            if ( hashtagString not in hashtagCounter ):
                hashtagCounter[hashtagString] = 1
            else:
                hashtagCounter[hashtagString] += 1

print ("Unique Hashtags:", len(hashtagCounter.keys()))
sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags:")
for ht in sortedHashtags[:20]:
    print ("\t", "#" + ht, hashtagCounter[ht])

# What keywords are we interested in?
targetKeywords = ["obama", "tear gas"]
# targetKeywords.append("lowery")
# targetKeywords.append("reilly")
targetKeywords.append("iraq")

# Build an empty map for each keyword we are seaching for
targetCounts = {x:[] for x in targetKeywords}
totalCount = []

# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    # Temporary counter for this minute
    localTargetCounts = {x:0 for x in targetKeywords}
    localTotalCount = 0
    
    for tweetObj in timeObj["list"]:
        tweetString = tweetObj["text"].lower()

        localTotalCount += 1
        
        # Add to the counter if the target keyword is in this tweet
        for keyword in targetKeywords:
            if ( keyword in tweetString ):
                localTargetCounts[keyword] += 1
                
    # Add the counts for this minute to the main counter
    totalCount.append(localTotalCount)
    for keyword in targetKeywords:
        targetCounts[keyword].append(localTargetCounts[keyword])
        
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(frequencyMap)), totalCount, label="Total")

for keyword in targetKeywords:
    ax.plot(range(len(frequencyMap)), targetCounts[keyword], label=keyword)
ax.legend()
ax.grid(b=True, which=u'major')

plt.show()

# A map for counting each language
languageCounter = {}

for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        lang = tweet["lang"]
        
        if ( lang not in languageCounter ):
            languageCounter[lang] = 1
        else:
            languageCounter[lang] += 1
            

languages = sorted(languageCounter.keys(), key=languageCounter.get, reverse=True)

for l in languages:
    print (l, languageCounter[l])

plt.figure(figsize=(16,8))
    
# the histogram of the data
plt.bar(
    np.arange(len(languages)),
    [languageCounter[x] for x in languages],
    log=True)

plt.xticks(np.arange(len(languages)) + 0.5, languages)
plt.xlabel('Languages')
plt.ylabel('Counts (Log)')
plt.title("Language Frequency")
plt.grid(True)

plt.show()

# A frequency map for timestamps to geo-coded tweets
geoFrequencyMap = {}
geoCount = 0

# Save only those tweets with tweet['coordinate']['coordinate'] entity
for t in sortedTimes:
    geos = list(filter(lambda tweet: tweet["coordinates"] != None and "coordinates" in tweet["coordinates"], frequencyMap[t]["list"]))
    geoCount += len(geos)
    
    # Add to the timestamp map
    geoFrequencyMap[t] = {"count": len(geos), "list": geos}

print ("Number of Geo Tweets:", geoCount)

fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Geo Tweet Frequency")

postFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes]

smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=45)

ax.plot(range(len(geoFrequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax.grid(b=True, which=u'major')
ax.legend()

plt.show()

import matplotlib

from mpl_toolkits.basemap import Basemap

# Create a list of all geo-coded tweets
tmpGeoList = [geoFrequencyMap[t]["list"] for t in sortedTimes]
geoTweets = reduce(lambda x, y: x + y, tmpGeoList)

# For each geo-coded tweet, extract its GPS coordinates
geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets]

# Now we build a map of the world using Basemap
land_color = 'lightgray'
water_color = 'lightblue'

fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
                   llcrnrlon=-180, urcrnrlon=180, resolution='l')

worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
ax.set_title('World Tweets')

# Convert points from GPS coordinates to (x,y) coordinates
convPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in convPoints]
y = [p[1] for p in convPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)

plt.show()

# Create a new map to hold the shape file data
stLouisMap = Basemap(llcrnrlon=-130, llcrnrlat=22, urcrnrlon=-64,
                     urcrnrlat=52, projection='merc', lat_1=33, lat_2=45,
                     lon_0=-95, resolution='i', area_thresh=10000)

# Read in the shape file
moStateShapeFile = os.path.join("data_files", "moCountyShapes", "tl_2010_29_county10")
shp_info = stLouisMap.readshapefile(moStateShapeFile, 'states', drawbounds=True)

# Find only those polygons that describe St. Louis county
stLouisCountyPolygons = []
for (shapeDict, shape) in zip(stLouisMap.states_info, stLouisMap.states):
    if (shapeDict["NAME10"] == "St. Louis"):
        stLouisCountyPolygons.append(matplotlib.patches.Polygon(shape))
        
print ("Shape Count:", len(stLouisCountyPolygons))

# Maps of timestamps to tweets for inside/outside Ferguson
inStLouisFreqMap = {}
outStLouisFreqMap = {}

# For each geo-coded tweet, extract coordinates and conver them to the Basemap space
for t in sortedTimes:
    geos = geoFrequencyMap[t]["list"]
    convPoints = [(stLouisMap(tw["coordinates"]["coordinates"][0], tw["coordinates"]["coordinates"][1]), tw) for tw in geos]

    # Local counters for this time
    inStLouisFreqMap[t] = {"count": 0, "list": []}
    outStLouisFreqMap[t] = {"count": 0, "list": []}
    
    # For each point, check if it is within St. Louis county or not
    for point in convPoints:
        x = point[0][0]
        y = point[0][1]

        inStLouisFlag = False

        for polygon in stLouisCountyPolygons:
            if ( polygon.contains_point((x, y)) ):
                inStLouisFreqMap[t]["list"].append(point[1])
                inStLouisFlag = True
                break

        if ( inStLouisFlag == False ):
            outStLouisFreqMap[t]["list"].append(point[1])

print ("Tweets in St. Louis:", np.sum([len(inStLouisFreqMap[t]["list"]) for t in sortedTimes]))
print ("Tweets outside St. Louis:", np.sum([len(outStLouisFreqMap[t]["list"]) for t in sortedTimes]))

inStLouisTweets = reduce(lambda x, y: x + y, [inStLouisFreqMap[t]["list"] for t in sortedTimes])

userCounter = {}
userMap = {}

for tweet in inStLouisTweets:
    user = tweet["user"]["screen_name"]

    if ( user not in userCounter ):
        userCounter[user] = 1
        userMap[user] = [tweet]
    else:
        userCounter[user] += 1
        userMap[user].append(tweet)

print ("Unique Users in St. Louis:", len(userCounter.keys()))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)

print("Top Users in Ferguson:")
for u in sortedUsers[:10]:
    print (u, userCounter[u])
    
    # Get user info
    try:
        user = api.get_user(u)
        print ("\t", user.description)
    except Exception as te:
        print ("\t", te)
        
    print ("\t", userMap[u][0]["text"], "\n----------")

outStLouisTweets = reduce(lambda x, y: x + y, [outStLouisFreqMap[t]["list"] for t in sortedTimes])

userCounter = {}
userMap = {}

for tweet in outStLouisTweets:
    user = tweet["user"]["screen_name"]

    if ( user not in userCounter ):
        userCounter[user] = 1
        userMap[user] = [tweet]
    else:
        userCounter[user] += 1
        userMap[user].append(tweet)

print ("Unique Users outside St. Louis:", len(userCounter.keys()))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)

print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, userCounter[u])
    
    # Get user info
    try:
        user = api.get_user(u)
        print ("\t", user.description)
    except Exception as te:
        print ("\t", te)
        
    print ("\t", userMap[u][0]["text"], "\n----------")

inStlHashtagCounter = {}

for tweet in inStLouisTweets:
    hashtagList = tweet["entities"]["hashtags"]

    for hashtagObj in hashtagList:
        hashtagString = hashtagObj["text"].lower()

        if ( hashtagString not in inStlHashtagCounter ):
            inStlHashtagCounter[hashtagString] = 1
        else:
            inStlHashtagCounter[hashtagString] += 1

print ("Unique Hashtags in Ferguson:", len(inStlHashtagCounter.keys()))
sortedInStlHashtags = sorted(inStlHashtagCounter, key=inStlHashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags in Ferguson:")
for ht in sortedInStlHashtags[:20]:
    print ("\t", "#" + ht, inStlHashtagCounter[ht])

outStlHashtagCounter = {}

for tweet in outStLouisTweets:
    hashtagList = tweet["entities"]["hashtags"]

    for hashtagObj in hashtagList:
        hashtagString = hashtagObj["text"].lower()

        if ( hashtagString not in outStlHashtagCounter ):
            outStlHashtagCounter[hashtagString] = 1
        else:
            outStlHashtagCounter[hashtagString] += 1

print ("Unique Hashtags Outside Ferguson:", len(outStlHashtagCounter.keys()))
sortedOutStlHashtags = sorted(outStlHashtagCounter, key=outStlHashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags Outside Ferguson:")
for ht in sortedOutStlHashtags[:20]:
    print ("\t", "#" + ht, outStlHashtagCounter[ht])

hourlyInterval = {}

for t in sortedTimes:
    newTime = t.replace(second=0, minute=0)
    
    currentTimeObject = frequencyMap[t]
    if ( newTime not in hourlyInterval ):
        hourlyInterval[newTime] = {
            "count": currentTimeObject["count"],
            "list": currentTimeObject["list"]
            }
    else:
        hourlyInterval[newTime]["count"] += currentTimeObject["count"]
        hourlyInterval[newTime]["list"] = hourlyInterval[newTime]["list"] + currentTimeObject["list"]

from IPython.display import display
from IPython.display import Image

for h in sorted(hourlyInterval.keys()):
    noRetweets = list(filter(lambda tweet: not tweet["text"].lower().startswith("rt"), hourlyInterval[h]["list"]))
    tweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], noRetweets))
    print (h, hourlyInterval[h]["count"], len(tweetsWithMedia), )
    
    randIndex = np.random.random_integers(0, len(tweetsWithMedia)-1, size=1)
    imgUrl = tweetsWithMedia[randIndex]["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))

stlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], inStLouisTweets))
print ("Tweets with Media:", len(stlTweetsWithMedia))

for tweet in stlTweetsWithMedia:
    imgUrl = tweet["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))

outStlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], outStLouisTweets))
print ("Tweets outside St. Louis with Media:", len(outStlTweetsWithMedia))

np.random.shuffle(outStlTweetsWithMedia)
for tweet in outStlTweetsWithMedia[:10]:
    imgUrl = tweet["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))

import re

# Read in the sentiment/valence files
dataFilePath = os.path.join("data_files", "SentiStrength")
valenceFile = os.path.join(dataFilePath, "EmotionLookupTable.txt")
emoticonFile = os.path.join(dataFilePath, "EmoticonLookupTable.txt")

valenceList = []

# Open the valence file and read in each word/valence pair
for line in open(valenceFile, "r"):
    # Split the line based on tabs and select the first two elements
    (word, valence) = line.split("\t")[:2]
    
    wordRegex = re.compile(word)
    valencePair = (wordRegex, int(valence))
    valenceList.append(valencePair)
    
# Open the emoticon file and read in the valence for each emoticon
for line in codecs.open(emoticonFile, "r", "utf-8"):
    # Split the line based on tabs and select the first two elements
    (emoticon, valence) = line.split("\t")[:2]
    
    emoticonRegex = re.compile(re.escape(emoticon))
    valencePair = (emoticonRegex, int(valence))
    valenceList.append(valencePair)
    
print ("Number of Sentiment Keywords:", len(valenceList))

# Examples of sentiment pairs
for i in np.random.random_integers(0, len(valenceList)-1, 10):
    print(valenceList[i][0].pattern, "\t", valenceList[i][1])

# Generate sentiment measures for each time
timeSentiments = {}
for t in sortedTimes:
    
    tweetList = frequencyMap[t]["list"]
    sentimentList = []
    thisMinuteSentiment = None
    
    for tweet in tweetList:
        
        # Calculate the average sentiment for this tweet
        tweetText = tweet["text"].lower()

        # skip retweets
        if ( tweetText.startswith("rt ") ):
            continue

        valCount = 0
        valSum = 0.0
        valAvg = 0.0
        for valencePair in valenceList:
            if ( valencePair[0].search(tweetText) is not None ):
                valCount += 1
                valSum += valencePair[1]

        if ( valCount > 0 ):
            valAvg = valSum / valCount
            sentimentList.append(valAvg)
    
    if ( len(sentimentList) > 0 ):
        thisMinuteSentiment = np.array(sentimentList).mean()
    else:
        thisMinuteSentiment = 0.0
        
    timeSentiments[t] = thisMinuteSentiment

fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Sentiment Over Time")

postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
sentList = [timeSentiments[x] for x in sortedTimes]

smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")

ax2 = ax.twinx()
ax2.plot([0], [0], color="blue", label="Posts")
ax2.plot(range(len(frequencyMap)), sentList, color="green", label="Sentiment")
ax2.set_ylim(-6,6)

ax.grid(b=True, which=u'major')
ax2.legend()

plt.show()

fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Sentiment Histrogram")

for (loc, (tweetList, color)) in {"Inside": (inStLouisTweets, "green"), "Outside": (outStLouisTweets, "blue")}.items():

    localSentimentList = []
    for tweet in tweetList:

        # Calculate the average sentiment for this tweet
        tweetText = tweet["text"].lower()

        # skip retweets
        if ( tweetText.startswith("rt ") ):
            continue

        valCount = 0
        valSum = 0.0
        valAvg = 0.0
        for valencePair in valenceList:
            if ( valencePair[0].search(tweetText) is not None ):
                valCount += 1
                valSum += valencePair[1]

        if ( valCount > 0 ):
            valAvg = valSum / valCount
            localSentimentList.append(valAvg)

    print("Number of Sentiment Tweets:", len(localSentimentList))

    ax.hist(localSentimentList, range=(-5, 5), normed=True, alpha=0.5, color=color, label=loc)

ax.grid(b=True, which=u'major')
ax.legend()

plt.show()

import gensim.models.ldamodel
import gensim.matutils
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

from nltk.corpus import stopwords

enFilter = lambda x: True if x["lang"] == "en" else False

# Get all tweets, filter out retweets, save only those in English, and conver to lowercase
allTweetList = reduce(lambda x, y: x + y, [frequencyMap[t]["list"] for t in sortedTimes])
noRetweetsList = list(filter(lambda x: not x["text"].lower().startswith("rt"), allTweetList))
onlyEnglishTweets = list(filter(enFilter, noRetweetsList))
lowerTweetText = [x["text"].lower() for x in onlyEnglishTweets]

print ("All Tweet Count:", len(allTweetList))
print ("Reduced Tweet Count:", len(lowerTweetText))

enStop = stopwords.words('english')

# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop + ["http", "https", "rt", "@", ":"]

vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode', 
                                                             tokenizer=None,
                                                             token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b',
                                                             stop_words=stopList,
                                                             binary=True)
# Create a vectorizer for all our content
vectorizer.fit(lowerTweetText)

# Get all the words in our text
names = vectorizer.get_feature_names()

# Create a map for vectorizer IDs to words
id2WordDict = dict(zip(range(len(vectorizer.get_feature_names())), names))

# Create a corpus for 
corpus = vectorizer.transform(lowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
# lda = gensim.models.ldamodel.LdaModel(gsCorpus, id2word=id2WordDict, num_topics=10)
lda = gensim.models.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=100, passes=2)

ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
    print ("Topic:", i)
    for token in topicTokens[i]:
        print ("\t", token)

inStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, inStLouisTweets)]

corpus = vectorizer.transform(inStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=10, passes=10)

ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
    print ("Topic:", i)
    for token in topicTokens[i]:
        print ("\t", token)

outStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, outStLouisTweets)]

corpus = vectorizer.transform(outStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=50, passes=10)

ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
    print ("Topic:", i)
    for token in topicTokens[i]:
        print ("\t", token)

import networkx as nx

graph = nx.DiGraph()

geoCodedMap = {1: inStLouisTweets, 0: outStLouisTweets}

for (location, locationList) in geoCodedMap.items():
    print (location, len(locationList))
    
    for tweet in locationList:
        userName = tweet["user"]["screen_name"]
        graph.add_node(userName, loc=location)

        mentionList = tweet["entities"]["user_mentions"]
        
        for otherUser in mentionList:
            otherUserName = otherUser["screen_name"]
            if ( graph.has_node(otherUserName) == False ):
                graph.add_node(otherUserName, loc=-1)
            graph.add_edge(userName, otherUserName)
        
print ("Number of Users:", len(graph.node))

pageRankList = nx.pagerank_numpy(graph)

highRankNodes = sorted(pageRankList.keys(), key=pageRankList.get, reverse=True)
for x in highRankNodes[:20]:
    user = api.get_user(x)
    print (x, pageRankList[x], "\n\t", user.description, "\n----------")
    

print (len(graph.nodes(data=True)))

colors = [0.9 if x[1]["loc"] == 1 else 0.1 for x in graph.nodes(data=True)]
pos = {x:(np.random.rand(2) * 10) for x in graph.nodes()}
nx.draw_networkx_nodes(graph, pos, node_color=colors)
nx.draw_networkx_edges(graph, pos)

nx.write_graphml(graph, "inVsOutNetwork.graphml", encoding='utf-8', prettyprint=False)


# If you want to play with the full graph, 
# here is code that will build it up for you.
# Be careful. It's large.

fullGraph = nx.DiGraph()

inStlUsers = set(map(lambda x: x["user"]["screen_name"], inStLouisTweets))
outStlUsers = set(map(lambda x: x["user"]["screen_name"], outStLouisTweets))

for (userName, tweetList) in globalUserMap.items():
    
    location = -1
    if ( userName in inStlUsers ):
        location = 1
    elif (userName in outStlUsers ):
        location = 0
        
    fullGraph.add_node(userName, loc=location)

    for tweet in tweetList:
        mentionList = tweet["entities"]["user_mentions"]

        for otherUser in mentionList:
            otherUserName = otherUser["screen_name"]
            if ( fullGraph.has_node(otherUserName) == False ):
                fullGraph.add_node(otherUserName, loc=-1)
            fullGraph.add_edge(userName, otherUserName)
            
print ("Number of Users:", len(fullGraph.node))

nx.write_graphml(fullGraph, "fullNetwork.graphml", encoding='utf-8', prettyprint=False)