%matplotlib inline import time import calendar import codecs import datetime import json import sys import gzip import string import glob import os import numpy as np if ( sys.version_info.major == 3 ): from functools import reduce tweetPath = os.path.join("data_files", "twitter") tweetFiles = { "time01": os.path.join(tweetPath, "statuses.*.gz") } frequencyMap = {} globalTweetCounter = 0 timeFormat = "%a %b %d %H:%M:%S +0000 %Y" reader = codecs.getreader("utf-8") for (key, path) in tweetFiles.items(): localTweetList = [] for filePath in glob.glob(path): print ("Reading File:", filePath) for line in gzip.open(filePath, 'rb'): # Try to read tweet JSON into object tweetObj = None try: tweetObj = json.loads(reader.decode(line)[0]) except Exception as e: continue # Deleted status messages and protected status must be skipped if ( "delete" in tweetObj.keys() or "status_withheld" in tweetObj.keys() ): continue # Try to extract the time of the tweet try: currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat) except: print (line) raise currentTime = currentTime.replace(second=0) # Increment tweet count globalTweetCounter += 1 # If our frequency map already has this time, use it, otherwise add if ( currentTime in frequencyMap.keys() ): timeMap = frequencyMap[currentTime] timeMap["count"] += 1 timeMap["list"].append(tweetObj) else: frequencyMap[currentTime] = {"count":1, "list":[tweetObj]} # Fill in any gaps times = sorted(frequencyMap.keys()) firstTime = times[0] lastTime = times[-1] thisTime = firstTime timeIntervalStep = datetime.timedelta(0, 60) # Time step in seconds while ( thisTime <= lastTime ): if ( thisTime not in frequencyMap.keys() ): frequencyMap[thisTime] = {"count":0, "list":[]} thisTime = thisTime + timeIntervalStep print ("Processed Tweet Count:", globalTweetCounter) import matplotlib.pyplot as plt fig, ax = plt.subplots() fig.set_size_inches(18.5,10.5) plt.title("Tweet Frequency") # Sort the times into an array for future use sortedTimes = sorted(frequencyMap.keys()) # What time span do these tweets cover? print ("Time Frame:", sortedTimes[0], sortedTimes[-1]) # Get a count of tweets per minute postFreqList = [frequencyMap[x]["count"] for x in sortedTimes] # We'll have ticks every thirty minutes (much more clutters the graph) smallerXTicks = range(0, len(sortedTimes), 30) plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90) # Plot the post frequency ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts") ax.grid(b=True, which=u'major') ax.legend() plt.show() # Create maps for holding counts and tweets for each user globalUserCounter = {} globalUserMap = {} # Iterate through the time stamps for t in sortedTimes: timeObj = frequencyMap[t] # For each tweet, pull the screen name and add it to the list for tweet in timeObj["list"]: user = tweet["user"]["screen_name"] if ( user not in globalUserCounter ): globalUserCounter[user] = 1 globalUserMap[user] = [tweet] else: globalUserCounter[user] += 1 globalUserMap[user].append(tweet) print ("Unique Users:", len(globalUserCounter.keys())) sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True) print ("Top Ten Most Prolific Users:") for u in sortedUsers[:10]: print (u, globalUserCounter[u], "\n\t", "Random Tweet:", globalUserMap[u][0]["text"], "\n----------") import tweepy consumer_key = "RfWoIb9wocCY0kOYKUYnf5VOo" consumer_secret = "FqsdZGdD4yvzwPj0yoe7lHRxgG4tjz2WVZbozxpOPnDunMhzv9" access_token = "2421639553-0IF33x71RsEJL2aKCksu0C1VR8383nqRQK0dYSE" access_token_secret = "3wSJCvLhgPBi8NUNVWbvosK2DAraGgB9K0NN0URNLVWjs" # Set up the authorization mechanisms for Tweepy to access Twitter's API auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.secure = True auth.set_access_token(access_token, access_token_secret) api = tweepy.API(auth) print ("Top Ten Most Prolific Users:") for u in sortedUsers[:10]: print (u, globalUserCounter[u]) # Get user info try: user = api.get_user(u) print ("\tDescription:", user.description) except Exception as te: print ("\tDescription Error:", te) print ("----------") plt.figure(figsize=(16,8)) # the histogram of the data plt.hist( [globalUserCounter[x] for x in globalUserCounter], bins=100, normed=0, alpha=0.75, label="Counts", log=True) plt.xlabel('Number of Tweets') plt.ylabel('Counts') plt.title("Histogram of Frequency") plt.grid(True) plt.legend() plt.show() avgPostCount = np.mean([globalUserCounter[x] for x in globalUserCounter]) print("Average Number of Posts:", avgPostCount) # A map for hashtag counts hashtagCounter = {} # For each minute, pull the list of hashtags and add to the counter for t in sortedTimes: timeObj = frequencyMap[t] for tweet in timeObj["list"]: hashtagList = tweet["entities"]["hashtags"] for hashtagObj in hashtagList: # We lowercase the hashtag to avoid duplicates (e.g., #MikeBrown vs. #mikebrown) hashtagString = hashtagObj["text"].lower() if ( hashtagString not in hashtagCounter ): hashtagCounter[hashtagString] = 1 else: hashtagCounter[hashtagString] += 1 print ("Unique Hashtags:", len(hashtagCounter.keys())) sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True) print ("Top Twenty Hashtags:") for ht in sortedHashtags[:20]: print ("\t", "#" + ht, hashtagCounter[ht]) # What keywords are we interested in? targetKeywords = ["obama", "tear gas"] # targetKeywords.append("lowery") # targetKeywords.append("reilly") targetKeywords.append("iraq") # Build an empty map for each keyword we are seaching for targetCounts = {x:[] for x in targetKeywords} totalCount = [] # For each minute, pull the tweet text and search for the keywords we want for t in sortedTimes: timeObj = frequencyMap[t] # Temporary counter for this minute localTargetCounts = {x:0 for x in targetKeywords} localTotalCount = 0 for tweetObj in timeObj["list"]: tweetString = tweetObj["text"].lower() localTotalCount += 1 # Add to the counter if the target keyword is in this tweet for keyword in targetKeywords: if ( keyword in tweetString ): localTargetCounts[keyword] += 1 # Add the counts for this minute to the main counter totalCount.append(localTotalCount) for keyword in targetKeywords: targetCounts[keyword].append(localTargetCounts[keyword]) # Now plot the total frequency and frequency of each keyword fig, ax = plt.subplots() fig.set_size_inches(18.5,10.5) plt.title("Tweet Frequency") plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90) ax.plot(range(len(frequencyMap)), totalCount, label="Total") for keyword in targetKeywords: ax.plot(range(len(frequencyMap)), targetCounts[keyword], label=keyword) ax.legend() ax.grid(b=True, which=u'major') plt.show() # A map for counting each language languageCounter = {} for t in sortedTimes: timeObj = frequencyMap[t] for tweet in timeObj["list"]: lang = tweet["lang"] if ( lang not in languageCounter ): languageCounter[lang] = 1 else: languageCounter[lang] += 1 languages = sorted(languageCounter.keys(), key=languageCounter.get, reverse=True) for l in languages: print (l, languageCounter[l]) plt.figure(figsize=(16,8)) # the histogram of the data plt.bar( np.arange(len(languages)), [languageCounter[x] for x in languages], log=True) plt.xticks(np.arange(len(languages)) + 0.5, languages) plt.xlabel('Languages') plt.ylabel('Counts (Log)') plt.title("Language Frequency") plt.grid(True) plt.show() # A frequency map for timestamps to geo-coded tweets geoFrequencyMap = {} geoCount = 0 # Save only those tweets with tweet['coordinate']['coordinate'] entity for t in sortedTimes: geos = list(filter(lambda tweet: tweet["coordinates"] != None and "coordinates" in tweet["coordinates"], frequencyMap[t]["list"])) geoCount += len(geos) # Add to the timestamp map geoFrequencyMap[t] = {"count": len(geos), "list": geos} print ("Number of Geo Tweets:", geoCount) fig, ax = plt.subplots() fig.set_size_inches(18.5,10.5) plt.title("Geo Tweet Frequency") postFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes] smallerXTicks = range(0, len(sortedTimes), 30) plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=45) ax.plot(range(len(geoFrequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts") ax.grid(b=True, which=u'major') ax.legend() plt.show() import matplotlib from mpl_toolkits.basemap import Basemap # Create a list of all geo-coded tweets tmpGeoList = [geoFrequencyMap[t]["list"] for t in sortedTimes] geoTweets = reduce(lambda x, y: x + y, tmpGeoList) # For each geo-coded tweet, extract its GPS coordinates geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets] # Now we build a map of the world using Basemap land_color = 'lightgray' water_color = 'lightblue' fig, ax = plt.subplots(figsize=(24,24)) worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80, llcrnrlon=-180, urcrnrlon=180, resolution='l') worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1) worldMap.drawcoastlines() worldMap.drawparallels(np.arange(-90.,120.,30.)) worldMap.drawmeridians(np.arange(0.,420.,60.)) worldMap.drawmapboundary(fill_color=water_color, zorder=0) ax.set_title('World Tweets') # Convert points from GPS coordinates to (x,y) coordinates convPoints = [worldMap(p[0], p[1]) for p in geoCoord] x = [p[0] for p in convPoints] y = [p[1] for p in convPoints] worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2) plt.show() # Create a new map to hold the shape file data stLouisMap = Basemap(llcrnrlon=-130, llcrnrlat=22, urcrnrlon=-64, urcrnrlat=52, projection='merc', lat_1=33, lat_2=45, lon_0=-95, resolution='i', area_thresh=10000) # Read in the shape file moStateShapeFile = os.path.join("data_files", "moCountyShapes", "tl_2010_29_county10") shp_info = stLouisMap.readshapefile(moStateShapeFile, 'states', drawbounds=True) # Find only those polygons that describe St. Louis county stLouisCountyPolygons = [] for (shapeDict, shape) in zip(stLouisMap.states_info, stLouisMap.states): if (shapeDict["NAME10"] == "St. Louis"): stLouisCountyPolygons.append(matplotlib.patches.Polygon(shape)) print ("Shape Count:", len(stLouisCountyPolygons)) # Maps of timestamps to tweets for inside/outside Ferguson inStLouisFreqMap = {} outStLouisFreqMap = {} # For each geo-coded tweet, extract coordinates and conver them to the Basemap space for t in sortedTimes: geos = geoFrequencyMap[t]["list"] convPoints = [(stLouisMap(tw["coordinates"]["coordinates"][0], tw["coordinates"]["coordinates"][1]), tw) for tw in geos] # Local counters for this time inStLouisFreqMap[t] = {"count": 0, "list": []} outStLouisFreqMap[t] = {"count": 0, "list": []} # For each point, check if it is within St. Louis county or not for point in convPoints: x = point[0][0] y = point[0][1] inStLouisFlag = False for polygon in stLouisCountyPolygons: if ( polygon.contains_point((x, y)) ): inStLouisFreqMap[t]["list"].append(point[1]) inStLouisFlag = True break if ( inStLouisFlag == False ): outStLouisFreqMap[t]["list"].append(point[1]) print ("Tweets in St. Louis:", np.sum([len(inStLouisFreqMap[t]["list"]) for t in sortedTimes])) print ("Tweets outside St. Louis:", np.sum([len(outStLouisFreqMap[t]["list"]) for t in sortedTimes])) inStLouisTweets = reduce(lambda x, y: x + y, [inStLouisFreqMap[t]["list"] for t in sortedTimes]) userCounter = {} userMap = {} for tweet in inStLouisTweets: user = tweet["user"]["screen_name"] if ( user not in userCounter ): userCounter[user] = 1 userMap[user] = [tweet] else: userCounter[user] += 1 userMap[user].append(tweet) print ("Unique Users in St. Louis:", len(userCounter.keys())) sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True) print("Top Users in Ferguson:") for u in sortedUsers[:10]: print (u, userCounter[u]) # Get user info try: user = api.get_user(u) print ("\t", user.description) except Exception as te: print ("\t", te) print ("\t", userMap[u][0]["text"], "\n----------") outStLouisTweets = reduce(lambda x, y: x + y, [outStLouisFreqMap[t]["list"] for t in sortedTimes]) userCounter = {} userMap = {} for tweet in outStLouisTweets: user = tweet["user"]["screen_name"] if ( user not in userCounter ): userCounter[user] = 1 userMap[user] = [tweet] else: userCounter[user] += 1 userMap[user].append(tweet) print ("Unique Users outside St. Louis:", len(userCounter.keys())) sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True) print ("Top Ten Most Prolific Users:") for u in sortedUsers[:10]: print (u, userCounter[u]) # Get user info try: user = api.get_user(u) print ("\t", user.description) except Exception as te: print ("\t", te) print ("\t", userMap[u][0]["text"], "\n----------") inStlHashtagCounter = {} for tweet in inStLouisTweets: hashtagList = tweet["entities"]["hashtags"] for hashtagObj in hashtagList: hashtagString = hashtagObj["text"].lower() if ( hashtagString not in inStlHashtagCounter ): inStlHashtagCounter[hashtagString] = 1 else: inStlHashtagCounter[hashtagString] += 1 print ("Unique Hashtags in Ferguson:", len(inStlHashtagCounter.keys())) sortedInStlHashtags = sorted(inStlHashtagCounter, key=inStlHashtagCounter.get, reverse=True) print ("Top Twenty Hashtags in Ferguson:") for ht in sortedInStlHashtags[:20]: print ("\t", "#" + ht, inStlHashtagCounter[ht]) outStlHashtagCounter = {} for tweet in outStLouisTweets: hashtagList = tweet["entities"]["hashtags"] for hashtagObj in hashtagList: hashtagString = hashtagObj["text"].lower() if ( hashtagString not in outStlHashtagCounter ): outStlHashtagCounter[hashtagString] = 1 else: outStlHashtagCounter[hashtagString] += 1 print ("Unique Hashtags Outside Ferguson:", len(outStlHashtagCounter.keys())) sortedOutStlHashtags = sorted(outStlHashtagCounter, key=outStlHashtagCounter.get, reverse=True) print ("Top Twenty Hashtags Outside Ferguson:") for ht in sortedOutStlHashtags[:20]: print ("\t", "#" + ht, outStlHashtagCounter[ht]) hourlyInterval = {} for t in sortedTimes: newTime = t.replace(second=0, minute=0) currentTimeObject = frequencyMap[t] if ( newTime not in hourlyInterval ): hourlyInterval[newTime] = { "count": currentTimeObject["count"], "list": currentTimeObject["list"] } else: hourlyInterval[newTime]["count"] += currentTimeObject["count"] hourlyInterval[newTime]["list"] = hourlyInterval[newTime]["list"] + currentTimeObject["list"] from IPython.display import display from IPython.display import Image for h in sorted(hourlyInterval.keys()): noRetweets = list(filter(lambda tweet: not tweet["text"].lower().startswith("rt"), hourlyInterval[h]["list"])) tweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], noRetweets)) print (h, hourlyInterval[h]["count"], len(tweetsWithMedia), ) randIndex = np.random.random_integers(0, len(tweetsWithMedia)-1, size=1) imgUrl = tweetsWithMedia[randIndex]["entities"]["media"][0]["media_url"] display(Image(url=imgUrl)) stlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], inStLouisTweets)) print ("Tweets with Media:", len(stlTweetsWithMedia)) for tweet in stlTweetsWithMedia: imgUrl = tweet["entities"]["media"][0]["media_url"] display(Image(url=imgUrl)) outStlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], outStLouisTweets)) print ("Tweets outside St. Louis with Media:", len(outStlTweetsWithMedia)) np.random.shuffle(outStlTweetsWithMedia) for tweet in outStlTweetsWithMedia[:10]: imgUrl = tweet["entities"]["media"][0]["media_url"] display(Image(url=imgUrl)) import re # Read in the sentiment/valence files dataFilePath = os.path.join("data_files", "SentiStrength") valenceFile = os.path.join(dataFilePath, "EmotionLookupTable.txt") emoticonFile = os.path.join(dataFilePath, "EmoticonLookupTable.txt") valenceList = [] # Open the valence file and read in each word/valence pair for line in open(valenceFile, "r"): # Split the line based on tabs and select the first two elements (word, valence) = line.split("\t")[:2] wordRegex = re.compile(word) valencePair = (wordRegex, int(valence)) valenceList.append(valencePair) # Open the emoticon file and read in the valence for each emoticon for line in codecs.open(emoticonFile, "r", "utf-8"): # Split the line based on tabs and select the first two elements (emoticon, valence) = line.split("\t")[:2] emoticonRegex = re.compile(re.escape(emoticon)) valencePair = (emoticonRegex, int(valence)) valenceList.append(valencePair) print ("Number of Sentiment Keywords:", len(valenceList)) # Examples of sentiment pairs for i in np.random.random_integers(0, len(valenceList)-1, 10): print(valenceList[i][0].pattern, "\t", valenceList[i][1]) # Generate sentiment measures for each time timeSentiments = {} for t in sortedTimes: tweetList = frequencyMap[t]["list"] sentimentList = [] thisMinuteSentiment = None for tweet in tweetList: # Calculate the average sentiment for this tweet tweetText = tweet["text"].lower() # skip retweets if ( tweetText.startswith("rt ") ): continue valCount = 0 valSum = 0.0 valAvg = 0.0 for valencePair in valenceList: if ( valencePair[0].search(tweetText) is not None ): valCount += 1 valSum += valencePair[1] if ( valCount > 0 ): valAvg = valSum / valCount sentimentList.append(valAvg) if ( len(sentimentList) > 0 ): thisMinuteSentiment = np.array(sentimentList).mean() else: thisMinuteSentiment = 0.0 timeSentiments[t] = thisMinuteSentiment fig, ax = plt.subplots() fig.set_size_inches(18.5,10.5) plt.title("Sentiment Over Time") postFreqList = [frequencyMap[x]["count"] for x in sortedTimes] sentList = [timeSentiments[x] for x in sortedTimes] smallerXTicks = range(0, len(sortedTimes), 30) plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90) ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts") ax2 = ax.twinx() ax2.plot([0], [0], color="blue", label="Posts") ax2.plot(range(len(frequencyMap)), sentList, color="green", label="Sentiment") ax2.set_ylim(-6,6) ax.grid(b=True, which=u'major') ax2.legend() plt.show() fig, ax = plt.subplots() fig.set_size_inches(18.5,10.5) plt.title("Sentiment Histrogram") for (loc, (tweetList, color)) in {"Inside": (inStLouisTweets, "green"), "Outside": (outStLouisTweets, "blue")}.items(): localSentimentList = [] for tweet in tweetList: # Calculate the average sentiment for this tweet tweetText = tweet["text"].lower() # skip retweets if ( tweetText.startswith("rt ") ): continue valCount = 0 valSum = 0.0 valAvg = 0.0 for valencePair in valenceList: if ( valencePair[0].search(tweetText) is not None ): valCount += 1 valSum += valencePair[1] if ( valCount > 0 ): valAvg = valSum / valCount localSentimentList.append(valAvg) print("Number of Sentiment Tweets:", len(localSentimentList)) ax.hist(localSentimentList, range=(-5, 5), normed=True, alpha=0.5, color=color, label=loc) ax.grid(b=True, which=u'major') ax.legend() plt.show() import gensim.models.ldamodel import gensim.matutils import sklearn.cluster import sklearn.feature_extraction import sklearn.feature_extraction.text import sklearn.metrics import sklearn.preprocessing from nltk.corpus import stopwords enFilter = lambda x: True if x["lang"] == "en" else False # Get all tweets, filter out retweets, save only those in English, and conver to lowercase allTweetList = reduce(lambda x, y: x + y, [frequencyMap[t]["list"] for t in sortedTimes]) noRetweetsList = list(filter(lambda x: not x["text"].lower().startswith("rt"), allTweetList)) onlyEnglishTweets = list(filter(enFilter, noRetweetsList)) lowerTweetText = [x["text"].lower() for x in onlyEnglishTweets] print ("All Tweet Count:", len(allTweetList)) print ("Reduced Tweet Count:", len(lowerTweetText)) enStop = stopwords.words('english') # Skip stop words, retweet signs, @ symbols, and URL headers stopList = enStop + ["http", "https", "rt", "@", ":"] vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode', tokenizer=None, token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b', stop_words=stopList, binary=True) # Create a vectorizer for all our content vectorizer.fit(lowerTweetText) # Get all the words in our text names = vectorizer.get_feature_names() # Create a map for vectorizer IDs to words id2WordDict = dict(zip(range(len(vectorizer.get_feature_names())), names)) # Create a corpus for corpus = vectorizer.transform(lowerTweetText) gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False) # lda = gensim.models.ldamodel.LdaModel(gsCorpus, id2word=id2WordDict, num_topics=10) lda = gensim.models.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=100, passes=2) ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False) topicTokens = [[token for (_,token) in topic] for topic in ldaTopics] for i in range(len(topicTokens)): print ("Topic:", i) for token in topicTokens[i]: print ("\t", token) inStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, inStLouisTweets)] corpus = vectorizer.transform(inStlLowerTweetText) gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False) lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=10, passes=10) ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False) topicTokens = [[token for (_,token) in topic] for topic in ldaTopics] for i in range(len(topicTokens)): print ("Topic:", i) for token in topicTokens[i]: print ("\t", token) outStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, outStLouisTweets)] corpus = vectorizer.transform(outStlLowerTweetText) gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False) lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=50, passes=10) ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False) topicTokens = [[token for (_,token) in topic] for topic in ldaTopics] for i in range(len(topicTokens)): print ("Topic:", i) for token in topicTokens[i]: print ("\t", token) import networkx as nx graph = nx.DiGraph() geoCodedMap = {1: inStLouisTweets, 0: outStLouisTweets} for (location, locationList) in geoCodedMap.items(): print (location, len(locationList)) for tweet in locationList: userName = tweet["user"]["screen_name"] graph.add_node(userName, loc=location) mentionList = tweet["entities"]["user_mentions"] for otherUser in mentionList: otherUserName = otherUser["screen_name"] if ( graph.has_node(otherUserName) == False ): graph.add_node(otherUserName, loc=-1) graph.add_edge(userName, otherUserName) print ("Number of Users:", len(graph.node)) pageRankList = nx.pagerank_numpy(graph) highRankNodes = sorted(pageRankList.keys(), key=pageRankList.get, reverse=True) for x in highRankNodes[:20]: user = api.get_user(x) print (x, pageRankList[x], "\n\t", user.description, "\n----------") print (len(graph.nodes(data=True))) colors = [0.9 if x[1]["loc"] == 1 else 0.1 for x in graph.nodes(data=True)] pos = {x:(np.random.rand(2) * 10) for x in graph.nodes()} nx.draw_networkx_nodes(graph, pos, node_color=colors) nx.draw_networkx_edges(graph, pos) nx.write_graphml(graph, "inVsOutNetwork.graphml", encoding='utf-8', prettyprint=False) # If you want to play with the full graph, # here is code that will build it up for you. # Be careful. It's large. fullGraph = nx.DiGraph() inStlUsers = set(map(lambda x: x["user"]["screen_name"], inStLouisTweets)) outStlUsers = set(map(lambda x: x["user"]["screen_name"], outStLouisTweets)) for (userName, tweetList) in globalUserMap.items(): location = -1 if ( userName in inStlUsers ): location = 1 elif (userName in outStlUsers ): location = 0 fullGraph.add_node(userName, loc=location) for tweet in tweetList: mentionList = tweet["entities"]["user_mentions"] for otherUser in mentionList: otherUserName = otherUser["screen_name"] if ( fullGraph.has_node(otherUserName) == False ): fullGraph.add_node(otherUserName, loc=-1) fullGraph.add_edge(userName, otherUserName) print ("Number of Users:", len(fullGraph.node)) nx.write_graphml(fullGraph, "fullNetwork.graphml", encoding='utf-8', prettyprint=False)