%matplotlib inline
import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import os
import numpy as np
if ( sys.version_info.major == 3 ):
from functools import reduce
The first thing we do is read in tweets from a directory of compressed files. Our collection of compressed tweets are in the data_files/twitter directory, so we'll use pattern matching (called "globbing") to find all the tweet files in the given directory.
Then, for each file, we'll open it, read each line (which is a tweet in JSON form), and build an object out of it. As part of this process, we will extract each tweet's post time and create a map from minute timestamps to the tweets posted during that minute.
# Locate the compressed tweet files; glob expands the wildcard below.
tweetPath = os.path.join("data_files", "twitter")
tweetFiles = {
    "time01": os.path.join(tweetPath, "statuses.*.gz")
}

# Map from per-minute timestamps to {"count": n, "list": [tweet objects]}
frequencyMap = {}
globalTweetCounter = 0

# Twitter's created_at format, e.g. "Wed Aug 13 16:00:00 +0000 2014"
timeFormat = "%a %b %d %H:%M:%S +0000 %Y"

reader = codecs.getreader("utf-8")

for (key, path) in tweetFiles.items():
    localTweetList = []
    for filePath in glob.glob(path):
        print("Reading File:", filePath)

        # `with` guarantees the gzip handle is closed (the original leaked it)
        with gzip.open(filePath, 'rb') as gzFile:
            for line in gzFile:

                # Try to read tweet JSON into an object; skip malformed lines
                try:
                    tweetObj = json.loads(reader.decode(line)[0])
                except Exception:
                    continue

                # Deleted status messages and protected statuses must be skipped
                if "delete" in tweetObj or "status_withheld" in tweetObj:
                    continue

                # Extract the post time; dump the raw line before re-raising
                # so bad records can be inspected
                try:
                    currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat)
                except (KeyError, ValueError):
                    print(line)
                    raise

                # Bucket tweets by minute
                currentTime = currentTime.replace(second=0)

                # Increment tweet count
                globalTweetCounter += 1

                # setdefault avoids the original's double dict lookup
                timeMap = frequencyMap.setdefault(currentTime, {"count": 0, "list": []})
                timeMap["count"] += 1
                timeMap["list"].append(tweetObj)

# Fill in any gaps so the timeline has an entry for every minute
times = sorted(frequencyMap.keys())
firstTime = times[0]
lastTime = times[-1]
thisTime = firstTime

timeIntervalStep = datetime.timedelta(seconds=60)  # one-minute step
while thisTime <= lastTime:
    if thisTime not in frequencyMap:
        frequencyMap[thisTime] = {"count": 0, "list": []}
    thisTime = thisTime + timeIntervalStep

print("Processed Tweet Count:", globalTweetCounter)
Reading File: data_files/twitter/statuses.log.2014-08-13-16.gz Reading File: data_files/twitter/statuses.log.2014-08-13-17.gz Reading File: data_files/twitter/statuses.log.2014-08-13-18.gz Reading File: data_files/twitter/statuses.log.2014-08-13-19.gz Reading File: data_files/twitter/statuses.log.2014-08-13-20.gz Reading File: data_files/twitter/statuses.log.2014-08-13-21.gz Reading File: data_files/twitter/statuses.log.2014-08-13-22.gz Reading File: data_files/twitter/statuses.log.2014-08-13-23.gz Reading File: data_files/twitter/statuses.log.2014-08-14-00.gz Reading File: data_files/twitter/statuses.log.2014-08-14-01.gz Reading File: data_files/twitter/statuses.log.2014-08-14-02.gz Reading File: data_files/twitter/statuses.log.2014-08-14-03.gz Reading File: data_files/twitter/statuses.log.2014-08-14-04.gz Reading File: data_files/twitter/statuses.log.2014-08-14-05.gz Reading File: data_files/twitter/statuses.log.2014-08-14-06.gz Reading File: data_files/twitter/statuses.log.2014-08-14-07.gz Reading File: data_files/twitter/statuses.log.2014-08-14-08.gz Reading File: data_files/twitter/statuses.log.2014-08-14-09.gz Reading File: data_files/twitter/statuses.log.2014-08-14-10.gz Reading File: data_files/twitter/statuses.log.2014-08-14-11.gz Reading File: data_files/twitter/statuses.log.2014-08-14-12.gz Reading File: data_files/twitter/statuses.log.2014-08-14-13.gz Reading File: data_files/twitter/statuses.log.2014-08-14-14.gz Reading File: data_files/twitter/statuses.log.2014-08-14-15.gz Reading File: data_files/twitter/statuses.log.2014-08-14-16.gz Reading File: data_files/twitter/statuses.log.2014-08-14-17.gz Processed Tweet Count: 293560
In this section, we will cover a few simple analysis techniques to garner some small insights rapidly.
To build a timeline of Twitter usage, we can simply plot the number of tweets posted per minute.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)

plt.title("Tweet Frequency")

# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())

# What time span do these tweets cover?
print("Time Frame:", sortedTimes[0], sortedTimes[-1])

# Get a count of tweets per minute
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

# We'll have ticks every thirty minutes (much more clutters the graph)
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

# Plot the post frequency (counts are never negative, so plot directly)
ax.plot(range(len(frequencyMap)), postFreqList, color="blue", label="Posts")
# Positional arg works on every matplotlib version (the old `b=` kwarg was
# renamed `visible=` in 3.5 and removed in 3.6)
ax.grid(True, which='major')
ax.legend()
plt.show()
Time Frame: 2014-08-13 16:00:00 2014-08-14 17:44:00
Ferguson was a contentious topic, and many people had differing opinions about the issue. Given the volume of tweets we are analyzing, we can now answer who the "loudest" voices were during this time.
That is, who was tweeting the most during this particular time span?
# Map each screen name to its tweet count and the list of its tweets
globalUserCounter = {}
globalUserMap = {}

# Walk every minute bucket and tally tweets per user
for t in sortedTimes:
    for tweet in frequencyMap[t]["list"]:
        user = tweet["user"]["screen_name"]
        # get/setdefault replace the original's branchy double-lookup pattern
        globalUserCounter[user] = globalUserCounter.get(user, 0) + 1
        globalUserMap.setdefault(user, []).append(tweet)

print("Unique Users:", len(globalUserCounter))
Unique Users: 171087
# Rank users by tweet volume, highest first
sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)

print("Top Ten Most Prolific Users:")
for screenName in sortedUsers[:10]:
    sampleText = globalUserMap[screenName][0]["text"]
    print(screenName, globalUserCounter[screenName],
          "\n\t", "Random Tweet:", sampleText, "\n----------")
Top Ten Most Prolific Users: miserablecitytv 94 Random Tweet: RT @AntonioFrench: One pattern of the last few days: After police move the media out of the area, they become more heavy-handed and violent… ---------- FreeGCF 80 Random Tweet: RT @WesleyLowery: Ppl in #Ferguson v sensitive to media descriptions of rioting, so worth noting: only behavior accurately described as suc… ---------- PLNoHope 70 Random Tweet: RT @iAirDry: “@BlkSportsOnline: A story on why what is happening in #Ferguson is a daily fear for many African-Americans http://t.co/0BrHqb… ---------- kingpin7666 66 Random Tweet: RT @HalpernAlex: This is #Ferguson, a suburb in America. http://t.co/GfmHLo4u5q ---------- desperate_jo13 60 Random Tweet: RT @TuxcedoCat: #Ferguson Police Department: Riot Gear ✔️ Tear Gas ✔️ Camouflage ✔️ Assault Rifles ✔️ Armored Land Mine Vehicles ✔️ Dashboa… ---------- TxWomenRock 57 Random Tweet: RT @SoulRevision: Yes & they cover #Ferguson RT @jkendall82: @OwlsAsylum @SoulRevision This is audio from St. Louis dispatch, not Ferguson… ---------- No_Cut_Card 56 Random Tweet: really? RT @kayquinn: #Ferguson police chief: he was upset body of #MikeBrown laid in street so long after shooting. ---------- sierramike320 55 Random Tweet: RT @ChdRountree: America will prove its worth in how it responds to the #MikeBrown tragedy in #Ferguson @OpFerguson ---------- Petapup1 55 Random Tweet: RT @RiseCoffeeSTL: Calling all journalists covering #MikeBrown #Ferguson FREE coffee/Wifi if you need a place to work! @AntonioFrench http:… ---------- TheAPJournalist 53 Random Tweet: RT @jrosenbaum: Media scrum before #Ferguson Police chief conference: http://t.co/3Cwuv4xqPr ----------
It's difficult to see who these people are, but we can go back to the Twitter API and get user descriptions for more information.
# Twitter API credentials.
# WARNING(security): these look like real credentials committed to source.
# They should be revoked and loaded from environment variables or a secrets
# store instead of being hardcoded here.
import tweepy

consumer_key = "RfWoIb9wocCY0kOYKUYnf5VOo"
consumer_secret = "FqsdZGdD4yvzwPj0yoe7lHRxgG4tjz2WVZbozxpOPnDunMhzv9"
access_token = "2421639553-0IF33x71RsEJL2aKCksu0C1VR8383nqRQK0dYSE"
access_token_secret = "3wSJCvLhgPBi8NUNVWbvosK2DAraGgB9K0NN0URNLVWjs"

# Set up the authorization mechanisms for Tweepy to access Twitter's API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# For each of the ten most prolific users, fetch and print the profile
# description from the live Twitter API.
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u])

    # Get user info
    # NOTE(review): positional get_user(u) assumes tweepy v3; tweepy v4
    # requires keyword arguments (screen_name=u) — confirm installed version.
    try:
        user = api.get_user(u)
        print ("\tDescription:", user.description)
    except Exception as te:
        print ("\tDescription Error:", te)

    print ("----------")
Top Ten Most Prolific Users: miserablecitytv 94 Description: Aggregator, Agitator & Amplifier - - - - - - - - - - - - - - - - - - - Striving for social justice is the most valuable thing to do in life. - Albert Einstein ---------- FreeGCF 80 Description: @xmariedrewbtrx ---------- PLNoHope 70 Description Error: Twitter error response: status code = 404 ---------- kingpin7666 66 Description: Veteran Of Three wars- Special Operations- #blacklivesmatter- The Future of the Democratic Party- Guilty of Being Black in A White America ---------- desperate_jo13 60 Description: i am jo, and i am desperate for justice. O_O [pronouns: they/them/their] ---------- TxWomenRock 57 Description: I block hellions, imps, & brutes || IF WOMEN CHOOSE THEY CAN MAKE #WENDYDAVIS THE NEXT GOVERNOR OF TEXAS || img by http://t.co/WGShienQMh ---------- No_Cut_Card 56 Description: life enthusiast. disgruntled #Wizards fan. ---------- sierramike320 55 Description: Advocate for the unheard and ignored. Interested in the earth, life, learning, love, and justice. Baseball is Healing. ---------- Petapup1 55 Description: Just me! #Prochoice, MARRIED, #atheist and a Diehard Capitals Fan! nsfw! #lgbtq ---------- TheAPJournalist 53 Description: Co-executive and global news editor at @ReadByline. Word writer, straight talker, political junkie and data enthusiast. (Not affiliated with the @AP) ----------
It appears a few users were posting to Twitter a lot. But how often did most Twitter users tweet during this time? We can build a histogram to see this distribution.
plt.figure(figsize=(16, 8))

# Histogram of per-user tweet counts; log-scaled y axis shows the long tail.
# NOTE: matplotlib removed the old `normed` kwarg; `density=False` is the
# modern equivalent of the original `normed=0`.
plt.hist(
    list(globalUserCounter.values()),
    bins=100,
    density=False,
    alpha=0.75,
    label="Counts",
    log=True)

plt.xlabel('Number of Tweets')
plt.ylabel('Counts')
plt.title("Histogram of Frequency")
plt.grid(True)
plt.legend()
plt.show()

# Average number of posts per user over the whole window
avgPostCount = np.mean(list(globalUserCounter.values()))
print("Average Number of Posts:", avgPostCount)
Average Number of Posts: 1.71585216878
Hashtags give us a quick way to view the conversation and see what people are discussing. Getting the most popular hashtags is just as easy as getting the most prolific users.
# A map for hashtag counts
hashtagCounter = {}

# For each minute, pull the list of hashtags and add to the counter
for t in sortedTimes:
    for tweet in frequencyMap[t]["list"]:
        for hashtagObj in tweet["entities"]["hashtags"]:
            # Lowercase to avoid duplicates (e.g., #MikeBrown vs. #mikebrown)
            hashtagString = hashtagObj["text"].lower()
            hashtagCounter[hashtagString] = hashtagCounter.get(hashtagString, 0) + 1

print("Unique Hashtags:", len(hashtagCounter))

sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)

print("Top Twenty Hashtags:")
for ht in sortedHashtags[:20]:
    print("\t", "#" + ht, hashtagCounter[ht])
Unique Hashtags: 6555 Top Twenty Hashtags: #ferguson 209701 #mikebrown 17824 #mediablackout 5322 #gaza 4497 #michaelbrown 2541 #dontshoot 1968 #anonymous 1836 #stl 1607 #palestine 1542 #prayforferguson 1525 #justiceformikebrown 1322 #opferguson 1160 #myawhite 995 #usa 956 #policestate 906 #fergusonshooting 875 #tcot 805 #inners 773 #iraq 736 #fergusonriot 656
Twitter is good for breaking news. When an impactful event occurs, we often see a spike on Twitter of the usage of a related keyword. Some examples are below.
# What keywords are we interested in?
targetKeywords = ["obama", "tear gas"]
# targetKeywords.append("lowery")
# targetKeywords.append("reilly")
targetKeywords.append("iraq")

# Build an empty series for each keyword we are searching for
targetCounts = {x: [] for x in targetKeywords}
totalCount = []

# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
    # Temporary counters for this minute
    localTargetCounts = {x: 0 for x in targetKeywords}
    localTotalCount = 0

    for tweetObj in frequencyMap[t]["list"]:
        tweetString = tweetObj["text"].lower()
        localTotalCount += 1

        # Add to the counter if the target keyword is in this tweet
        for keyword in targetKeywords:
            if keyword in tweetString:
                localTargetCounts[keyword] += 1

    # Append this minute's counts to the per-keyword series
    totalCount.append(localTotalCount)
    for keyword in targetKeywords:
        targetCounts[keyword].append(localTargetCounts[keyword])

# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)

plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(frequencyMap)), totalCount, label="Total")

for keyword in targetKeywords:
    ax.plot(range(len(frequencyMap)), targetCounts[keyword], label=keyword)

ax.legend()
# Positional form avoids the removed `b=` kwarg (renamed in matplotlib 3.5)
ax.grid(True, which='major')
plt.show()
The protests in Ferguson, MO became an international topic of discussion. As a result, people all over the world were tweeting about the events. Using Twitter's data, we can see how many people were tweeting in different languages.
# A map for counting tweets by Twitter-detected language code
languageCounter = {}

for t in sortedTimes:
    for tweet in frequencyMap[t]["list"]:
        lang = tweet["lang"]
        # dict.get replaces the original's branchy double-lookup pattern
        languageCounter[lang] = languageCounter.get(lang, 0) + 1

# Sort language codes by frequency, most common first
languages = sorted(languageCounter.keys(), key=languageCounter.get, reverse=True)

for l in languages:
    print(l, languageCounter[l])
en 282138 es 3759 und 1882 de 1133 tr 795 fr 623 et 476 sk 463 tl 330 in 306 ar 282 it 241 pt 191 da 160 nl 140 ht 127 pl 81 sl 72 ja 62 sv 57 vi 54 no 51 th 29 ru 20 hu 16 is 14 fa 12 el 10 zh 10 lt 6 lv 5 fi 5 ko 4 hi 2 bg 2 iw 1 iu 1
plt.figure(figsize=(16, 8))

# Bar chart of tweet counts per language; log scale keeps rare languages visible
positions = np.arange(len(languages))
counts = [languageCounter[code] for code in languages]

plt.bar(positions, counts, log=True)
plt.xticks(positions + 0.5, languages)

plt.xlabel('Languages')
plt.ylabel('Counts (Log)')
plt.title("Language Frequency")
plt.grid(True)
plt.show()
Twitter allows users to share their GPS locations when tweeting, but only about 2% of tweets have this information. We can extract this geospatial data to look at patterns in different locations.
In this module, we will look at:
Each tweet has a field called "coordinates" describing from where the tweet was posted. The field might be null if the tweet contains no location data, or it could contain bounding box information, place information, or GPS coordinates in the form of (longitude, latitude). We want tweets with this GPS data.
For more information on tweet JSON formats, check out https://dev.twitter.com/overview/api/tweets
# A frequency map for timestamps to geo-coded tweets
geoFrequencyMap = {}
geoCount = 0

# Keep only tweets that carry a tweet["coordinates"]["coordinates"] entity
for t in sortedTimes:
    # `is not None` is the correct identity test (original used `!= None`);
    # a comprehension replaces the filter/lambda combination
    geos = [
        tweet for tweet in frequencyMap[t]["list"]
        if tweet["coordinates"] is not None and "coordinates" in tweet["coordinates"]
    ]
    geoCount += len(geos)

    # Add to the timestamp map
    geoFrequencyMap[t] = {"count": len(geos), "list": geos}

print("Number of Geo Tweets:", geoCount)
Number of Geo Tweets: 2000
What is the frequency of GPS-coded tweets?
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)

plt.title("Geo Tweet Frequency")

# Per-minute counts of geo-coded tweets
postFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes]

smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=45)

# Counts are never negative, so plot them directly
ax.plot(range(len(geoFrequencyMap)), postFreqList, color="blue", label="Posts")
# Positional form avoids the removed `b=` kwarg (renamed in matplotlib 3.5)
ax.grid(True, which='major')
ax.legend()
plt.show()
Now that we have a list of all the tweets with GPS coordinates, we can plot from where in the world these tweets were posted. To make this plot, we can leverage the Basemap package to make a map of the world and convert GPS coordinates to (x, y) coordinates we can then plot.
import matplotlib
from mpl_toolkits.basemap import Basemap

# Flatten all geo-coded tweets into one list.  A nested comprehension is
# linear; the original reduce(list + list) rebuilt the list each step (O(n^2)).
geoTweets = [tw for t in sortedTimes for tw in geoFrequencyMap[t]["list"]]

# For each geo-coded tweet, extract its GPS pair as (longitude, latitude)
geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets]

# Now we build a map of the world using Basemap
land_color = 'lightgray'
water_color = 'lightblue'

fig, ax = plt.subplots(figsize=(24, 24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
                   llcrnrlon=-180, urcrnrlon=180, resolution='l')

worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90., 120., 30.))
worldMap.drawmeridians(np.arange(0., 420., 60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
ax.set_title('World Tweets')

# Convert points from GPS coordinates to map-projection (x, y) coordinates
convPoints = [worldMap(p[0], p[1]) for p in geoCoord]

x = [p[0] for p in convPoints]
y = [p[1] for p in convPoints]

worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)
plt.show()
We can even use existing Geographic Information System (GIS) tools to determine from where a tweet was posted. For example, we could ask whether a particular tweet was posted from the United States.
To make this determination, we can use geocoding services like Google Maps, or we can use GIS data files called Shape Files, which contain geometric information for a variety of geographic entities (e.g., lakes, roads, county lines, states, countries, etc.).
For our purposes, we pulled a shape file containing the county borders for the state of Missouri, which were sourced from the US Census Department (http://www.census.gov/cgi-bin/geo/shapefiles2010/layers.cgi).
The first step then is to read in this shape file. To divide the Twitter data into those from inside Ferguson, MO and those outside, we found the county containing Ferguson, and we extract the shapes for that county.
# Create a Basemap covering the continental US to hold the shape file data
stLouisMap = Basemap(llcrnrlon=-130, llcrnrlat=22, urcrnrlon=-64,
                     urcrnrlat=52, projection='merc', lat_1=33, lat_2=45,
                     lon_0=-95, resolution='i', area_thresh=10000)

# Read in the Missouri county shape file
moStateShapeFile = os.path.join("data_files", "moCountyShapes", "tl_2010_29_county10")
shp_info = stLouisMap.readshapefile(moStateShapeFile, 'states', drawbounds=True)

# Keep only the polygons that describe St. Louis county
stLouisCountyPolygons = [
    matplotlib.patches.Polygon(countyShape)
    for (countyAttrs, countyShape) in zip(stLouisMap.states_info, stLouisMap.states)
    if countyAttrs["NAME10"] == "St. Louis"
]

print("Shape Count:", len(stLouisCountyPolygons))
Shape Count: 2
For each tweet, we can check whether its GPS coordinates came from St. Louis county or not.
# Maps of timestamps to tweets posted inside/outside St. Louis county
inStLouisFreqMap = {}
outStLouisFreqMap = {}

# For each geo-coded tweet, extract coordinates and convert them to the Basemap space
for t in sortedTimes:
    geos = geoFrequencyMap[t]["list"]
    convPoints = [(stLouisMap(tw["coordinates"]["coordinates"][0],
                              tw["coordinates"]["coordinates"][1]), tw)
                  for tw in geos]

    # Local buckets for this minute
    inStLouisFreqMap[t] = {"count": 0, "list": []}
    outStLouisFreqMap[t] = {"count": 0, "list": []}

    # For each point, check whether it falls within St. Louis county
    for ((x, y), tweet) in convPoints:
        targetMap = outStLouisFreqMap
        for polygon in stLouisCountyPolygons:
            if polygon.contains_point((x, y)):
                targetMap = inStLouisFreqMap
                break

        targetMap[t]["list"].append(tweet)
        # BUG FIX: the original never incremented "count", leaving it at 0;
        # keep it in sync with the list length
        targetMap[t]["count"] += 1

print("Tweets in St. Louis:", np.sum([len(inStLouisFreqMap[t]["list"]) for t in sortedTimes]))
print("Tweets outside St. Louis:", np.sum([len(outStLouisFreqMap[t]["list"]) for t in sortedTimes]))
Tweets in St. Louis: 100 Tweets outside St. Louis: 1900
Now that we have divided the data based on those who were tweeting from within Ferguson, MO versus those who were outside, we can identify the most prolific users in each group.
# Flatten all in-county tweets into one list (linear; reduce concat was O(n^2))
inStLouisTweets = [tw for t in sortedTimes for tw in inStLouisFreqMap[t]["list"]]

# Per-user tweet counts and tweet lists for users inside St. Louis county
userCounter = {}
userMap = {}

for tweet in inStLouisTweets:
    user = tweet["user"]["screen_name"]
    userCounter[user] = userCounter.get(user, 0) + 1
    userMap.setdefault(user, []).append(tweet)

print("Unique Users in St. Louis:", len(userCounter))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)
Unique Users in St. Louis: 79
print("Top Users in Ferguson:")

for screenName in sortedUsers[:10]:
    print(screenName, userCounter[screenName])

    # Look up the user's profile description via the Twitter API
    try:
        profile = api.get_user(screenName)
        print("\t", profile.description)
    except Exception as err:
        print("\t", err)

    print("\t", userMap[screenName][0]["text"], "\n----------")
Top Users in Ferguson: TheWidowJones 6 Instant gratification takes too long. “@mattdpearce: Quick writeup of my call to the Ferguson police chief, notifying him of arrests: http://t.co/l3ugLhoZKy” jebus ---------- jst_mani 3 Just a Mizzou Tiger trying to earn her stripes. When you believe in yourself, there's never any room for doubt. If you want something, go get it. Period. http://t.co/4nTUmvrlAq ---------- leisazigman 3 President of The Genome Partnership, a non profit working in the field of genomics Just in: ACLU sends letter to #Ferguson pd concerned they asked vigils/protests just be held during day #MikeBrown ---------- RoadRunnerSTL 3 Follow photojournalist Bobby Hughes as he hits the streets overnight and in the early morning for breaking news around St. Louis. About 100 or more demonstrate outside Ferguson City Hall into the early morning hours . http://t.co/NbaB2YpzNs ---------- CaseyNolen 3 @ksdknews Multimedia Journalist : @theninenetwork Host of #StayTunedSTL http://t.co/fx4hH3lN9N STL Co Pros says "all evidence" will eventually be made public regardless of outcome of investigation. But not while ongoing. #Ferguson ---------- michaelcalhoun 2 I tell stories for @KMOXNews and occasionally for @CBSRadioNews. Ke$ha told me once that she liked my beard. mrcalhoun@cbs.com To restate: protestor with loudspeaker said that when media leave "y'all [in crowd] better watch out" because police will act. #Ferguson ---------- jasonahuff 2 Husband. Dad. Fundraiser. Proud #STL native. Craft beer enthusiast. Tweets are my own. Most major tv & print news in #STL have btwn 20-58K followers. @AntonioFrench has almost 38K. #CitizenJournalists #Ferguson ---------- DochtuirRussell 2 Doctor of Pharmacy. Law Student at SLU. Veteran. Cheering for Blue Jays, Cardinals, Blues and Habs. 
Lovely destination #Ferguson seems http://t.co/gzuU8iAc3V ---------- ShayMeinecke 2 #journalist Peaceful protest on #Ferguson #vice.com @ Ferguson, MO http://t.co/1lXXkYGcg6 ---------- DonGallowayKSDK 2 Photojournalist, KSDK NewsChannel 5 Church of God & Christ, Ferguson MO helping with the healing. #MichaelBrown #FergusonShooting #ksdknews http://t.co/MahrpsTr6d ----------
# Flatten all out-of-county tweets into one list (linear; reduce concat was O(n^2))
outStLouisTweets = [tw for t in sortedTimes for tw in outStLouisFreqMap[t]["list"]]

# Per-user tweet counts and tweet lists for users outside St. Louis county
userCounter = {}
userMap = {}

for tweet in outStLouisTweets:
    user = tweet["user"]["screen_name"]
    userCounter[user] = userCounter.get(user, 0) + 1
    userMap.setdefault(user, []).append(tweet)

print("Unique Users outside St. Louis:", len(userCounter))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)
Unique Users outside St. Louis: 1689
print("Top Ten Most Prolific Users:")

for screenName in sortedUsers[:10]:
    print(screenName, userCounter[screenName])

    # Pull the account's profile description from the Twitter API
    try:
        profile = api.get_user(screenName)
        print("\t", profile.description)
    except Exception as err:
        print("\t", err)

    print("\t", userMap[screenName][0]["text"], "\n----------")
Top Ten Most Prolific Users: orlando_tina 8 retired model I love my dogs and hunting deer Where is Obama now?# Ferguson ---------- ipimi 8 Politically Progressive Boomer Feminist Geek Physician Writer...And~proud US Census member of The African Diaspora Mid-70sHowardZinnStudent@BU ♓️(3*17'54DCnw) “@HuffPostPol: Ferguson police to meet with Michael Brown's mother http://t.co/t9obrJAayC” #inners ---------- Its_XADR 6 18. 12/11 Designer. Sniper. http://t.co/58xDj308tH @TahhDah @Obey_Rebirthh just click this and see #Ferguson ---------- sametaydoan4 5 unutur mu sevdiğini görmeyince göz #Ferguson crowd threw rocks, bottles, Molotov cocktails, saw a gun, right before this stepped off http://t.co/uKrO1L7kj9 ---------- poetarmone 5 Twitter error response: status code = 404 #ferguson I am pissed at all this foolish Marshall law talk,STOP SAYING IT, STOP LIEING ---------- yummyyo 4 Pause and Pray! If that doesn't work, you didn't do it. The VIP area is a 5-foot radius around me always. I'm fun; have fun with me! http://t.co/aOGje7ZPlP ---------- itsablackguy 4 if u try to roast me or whatever, just remember i been fat for over 20 years. ive heard all the jokes https://t.co/ebOPMbzW2W My coworkers haven't mentioned #Ferguson all week. I'm not surprised though... ---------- KeithJonesJr 4 Public Servant, Campaign Manager, Socialite, Club Promoter, Social Activist for the LGBT movement, future lawyer & MOREHOUSE Man. #RGODC #Leo Kik: KeithJonesJr I see many lawsuits coming. #Ferguson ---------- ronnyshreve 3 Freelance Journalist. Host of the Ron Shreve Show. Reporting on Government corruption, the police state, and crony capitalism ronshrevetips@yahoo.com The police in #Ferguson are criminal scum and need to be arrested and put in prison! #WakeUpNow #tcot ---------- minnman47 3 world affairs in the crosshairs...trigger finger on the pulse of society http://t.co/Bi36ltzFBE ----------
We've already looked at popular hashtags over the course of the day. How does this usage change from inside Ferguson to outside?
# Hashtag counts for tweets posted inside St. Louis county
inStlHashtagCounter = {}

for tweet in inStLouisTweets:
    for hashtagObj in tweet["entities"]["hashtags"]:
        # Lowercase to merge case variants of the same hashtag
        hashtagString = hashtagObj["text"].lower()
        inStlHashtagCounter[hashtagString] = inStlHashtagCounter.get(hashtagString, 0) + 1

print("Unique Hashtags in Ferguson:", len(inStlHashtagCounter))

sortedInStlHashtags = sorted(inStlHashtagCounter, key=inStlHashtagCounter.get, reverse=True)

print("Top Twenty Hashtags in Ferguson:")
for ht in sortedInStlHashtags[:20]:
    print("\t", "#" + ht, inStlHashtagCounter[ht])
Unique Hashtags in Ferguson: 19 Top Twenty Hashtags in Ferguson: #ferguson 64 #mikebrown 9 #michaelbrown 3 #ksdk 3 #stl 3 #justiceformikebrown 2 #vice 1 #freefrench 1 #icebucketchallenge 1 #neoamerica 1 #moleg 1 #noco 1 #sandyhook 1 #citizenjournalists 1 #rai 1 #ksdknews 1 #barackobama 1 #fergusonshooting 1 #free 1
# Hashtag counts for tweets posted outside St. Louis county
outStlHashtagCounter = {}

for tweet in outStLouisTweets:
    for hashtagObj in tweet["entities"]["hashtags"]:
        # Lowercase to merge case variants of the same hashtag
        hashtagString = hashtagObj["text"].lower()
        outStlHashtagCounter[hashtagString] = outStlHashtagCounter.get(hashtagString, 0) + 1

print("Unique Hashtags Outside Ferguson:", len(outStlHashtagCounter))

sortedOutStlHashtags = sorted(outStlHashtagCounter, key=outStlHashtagCounter.get, reverse=True)

print("Top Twenty Hashtags Outside Ferguson:")
for ht in sortedOutStlHashtags[:20]:
    print("\t", "#" + ht, outStlHashtagCounter[ht])
Unique Hashtags Outside Ferguson: 273 Top Twenty Hashtags Outside Ferguson: #ferguson 1199 #mikebrown 63 #mediablackout 37 #fergusonriot 18 #prayforferguson 16 #fergusonshooting 15 #gaza 15 #justiceformikebrown 12 #michaelbrown 10 #policestate 9 #inners 9 #dontshoot 9 #policebrutality 7 #tcot 7 #police 6 #ripmikebrown 6 #anonymous 6 #stl 6 #handsupdontshoot 5 #iftheygunnedmedown 4
Twitter is excellent for sharing media, whether photographs, movies, or links to websites. When you share pictures, Twitter stores them and links to them directly. We can use this data to sample some random pictures taken from each hour of the data we have.
We'll look at:
First, we need to reduce our map of minutes->tweets to hours->tweets.
# Aggregate the per-minute frequency map into per-hour buckets
hourlyInterval = {}

for t in sortedTimes:
    hourKey = t.replace(second=0, minute=0)
    minuteBucket = frequencyMap[t]

    if hourKey not in hourlyInterval:
        # Copy the list so extending the hourly bucket below never mutates
        # the per-minute lists in frequencyMap
        hourlyInterval[hourKey] = {
            "count": minuteBucket["count"],
            "list": list(minuteBucket["list"])
        }
    else:
        hourlyInterval[hourKey]["count"] += minuteBucket["count"]
        # extend() is linear; the original `list = list + list` rebuilt the
        # hourly list on every minute (quadratic in tweets per hour)
        hourlyInterval[hourKey]["list"].extend(minuteBucket["list"])
Then we filter out retweets and keep only those tweets with a media listing in the "entities" section. Then, we select a random image from the list of pictures for that hour and display it.
from IPython.display import display
from IPython.display import Image

for h in sorted(hourlyInterval.keys()):
    # Drop retweets, then keep only tweets carrying a media entity
    noRetweets = [tw for tw in hourlyInterval[h]["list"]
                  if not tw["text"].lower().startswith("rt")]
    tweetsWithMedia = [tw for tw in noRetweets if "media" in tw["entities"]]

    print(h, hourlyInterval[h]["count"], len(tweetsWithMedia))

    # Guard: an hour with zero media tweets would have crashed the original
    # (random_integers(0, -1) raises, and len-1 indexing underflows)
    if not tweetsWithMedia:
        continue

    # np.random.randint replaces the removed random_integers and returns a
    # plain int, which is a valid list index (a size-1 ndarray is not)
    randIndex = np.random.randint(0, len(tweetsWithMedia))
    imgUrl = tweetsWithMedia[randIndex]["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))
2014-08-13 16:00:00 1004 20
2014-08-13 17:00:00 2098 34
2014-08-13 18:00:00 1542 31
2014-08-13 19:00:00 1453 28
2014-08-13 20:00:00 2390 48
2014-08-13 21:00:00 2139 67
2014-08-13 22:00:00 2011 57
2014-08-13 23:00:00 5943 149
2014-08-14 00:00:00 10551 166
2014-08-14 01:00:00 16081 196
2014-08-14 02:00:00 30646 318
2014-08-14 03:00:00 32112 416
2014-08-14 04:00:00 26010 330
2014-08-14 05:00:00 18796 288
2014-08-14 06:00:00 17034 269
2014-08-14 07:00:00 18170 278
2014-08-14 08:00:00 9649 173
2014-08-14 09:00:00 4929 83
2014-08-14 10:00:00 4299 87
2014-08-14 11:00:00 5740 112
2014-08-14 12:00:00 7297 130
2014-08-14 13:00:00 11649 226
2014-08-14 14:00:00 15035 244
2014-08-14 15:00:00 15784 304
2014-08-14 16:00:00 17571 324
2014-08-14 17:00:00 13627 218
We can also extract images people tweeted from Ferguson.
# Show the image attached to every in-county tweet that carries media
stlTweetsWithMedia = [tw for tw in inStLouisTweets if "media" in tw["entities"]]
print("Tweets with Media:", len(stlTweetsWithMedia))

for mediaTweet in stlTweetsWithMedia:
    display(Image(url=mediaTweet["entities"]["media"][0]["media_url"]))
Tweets with Media: 16
Here, we extract 10 random images from outside Ferguson.
# Sample ten random media-bearing tweets from outside the county
outStlTweetsWithMedia = [tw for tw in outStLouisTweets if "media" in tw["entities"]]
print("Tweets outside St. Louis with Media:", len(outStlTweetsWithMedia))

np.random.shuffle(outStlTweetsWithMedia)

for mediaTweet in outStlTweetsWithMedia[:10]:
    display(Image(url=mediaTweet["entities"]["media"][0]["media_url"]))
Tweets outside St. Louis with Media: 188
Another popular type of analysis people do on social networks is "sentiment analysis," which is used to figure out how people feel about a specific topic.
One way to explore sentiment is to use a list of keywords with tagged sentiment information (e.g., "happy" or "awesome" might have high sentiment whereas "terrible" or "awful" might have very low sentiment). Then, we can count the occurrence of these tagged keywords to get a sense of how people feel about the topic at hand.
We use the AFINN Sentiment Dictionary for our keyword list. Link here: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
import re
# Read in the sentiment/valence files
dataFilePath = os.path.join("data_files", "SentiStrength")
valenceFile = os.path.join(dataFilePath, "EmotionLookupTable.txt")
emoticonFile = os.path.join(dataFilePath, "EmoticonLookupTable.txt")
# List of (compiled regex, integer valence) pairs used for sentiment scoring
valenceList = []
# Open the valence file and read in each word/valence pair.
# Use "with" so the file handle is closed even if a line fails to parse.
with open(valenceFile, "r") as vFile:
    for line in vFile:
        # Split the line based on tabs and select the first two elements
        (word, valence) = line.split("\t")[:2]
        # The word column already contains regex syntax (e.g. "drown[a-z]*"),
        # so compile it as-is rather than escaping it
        wordRegex = re.compile(word)
        valenceList.append((wordRegex, int(valence)))
# Open the emoticon file and read in the valence for each emoticon
with codecs.open(emoticonFile, "r", "utf-8") as eFile:
    for line in eFile:
        # Split the line based on tabs and select the first two elements
        (emoticon, valence) = line.split("\t")[:2]
        # Emoticons are literal strings, so escape any regex metacharacters
        emoticonRegex = re.compile(re.escape(emoticon))
        valenceList.append((emoticonRegex, int(valence)))
print ("Number of Sentiment Keywords:", len(valenceList))
Number of Sentiment Keywords: 2659
# Examples of sentiment pairs.
# np.random.random_integers is deprecated (and removed in modern NumPy);
# randint's upper bound is exclusive, so pass len(valenceList) directly.
for i in np.random.randint(0, len(valenceList), 10):
    print(valenceList[i][0].pattern, "\t", valenceList[i][1])
drown[a-z]* -2 desirable 4 intolleran[a-z]* -3 hopefully 1 plays 2 pained -2 dirt -2 intimidat[a-z]* -4 XP 1 disagre[a-z]* -2
# Generate sentiment measures for each time.
# For every minute, average the per-tweet sentiment of non-retweets,
# where a tweet's sentiment is the mean valence of all matching keywords.
timeSentiments = {}
for minute in sortedTimes:
    perTweetAverages = []
    for tweet in frequencyMap[minute]["list"]:
        lowerText = tweet["text"].lower()
        # skip retweets
        if lowerText.startswith("rt "):
            continue
        # Valences of every sentiment pattern that appears in this tweet
        matchedValences = [score for (pattern, score) in valenceList
                           if pattern.search(lowerText) is not None]
        if matchedValences:
            perTweetAverages.append(float(sum(matchedValences)) / len(matchedValences))
        else:
            # Tweets with no sentiment keywords count as neutral (0.0)
            perTweetAverages.append(0.0)
    # Minutes containing only retweets (or no tweets) are also neutral
    timeSentiments[minute] = np.mean(perTweetAverages) if perTweetAverages else 0.0
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
plt.title("Sentiment Over Time")
# Post frequency and average sentiment for every minute, in time order
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
sentList = [timeSentiments[x] for x in sortedTimes]
# Thin out the x-axis labels (one every 30 minutes) so they stay readable
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)
ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
# Second y-axis so sentiment (-6..6) isn't flattened by the post counts
ax2 = ax.twinx()
# Proxy line so the "Posts" entry appears in ax2's combined legend
ax2.plot([0], [0], color="blue", label="Posts")
ax2.plot(range(len(frequencyMap)), sentList, color="green", label="Sentiment")
ax2.set_ylim(-6, 6)
# The "b" keyword was removed in Matplotlib 3.5; pass visibility positionally
ax.grid(True, which='major')
ax2.legend()
plt.show()
Based on this data, we can see that most people are pretty unhappy with the events in Ferguson, MO. This result is not all that unexpected.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
# Fixed typo in the original title ("Histrogram")
plt.title("Sentiment Histogram")
# Compare sentiment distributions for tweets inside vs. outside St. Louis
for (loc, (tweetList, color)) in {"Inside": (inStLouisTweets, "green"), "Outside": (outStLouisTweets, "blue")}.items():
    localSentimentList = []
    for tweet in tweetList:
        # Calculate the average sentiment for this tweet
        tweetText = tweet["text"].lower()
        # skip retweets
        if tweetText.startswith("rt "):
            continue
        valCount = 0
        valSum = 0.0
        valAvg = 0.0
        for (pattern, score) in valenceList:
            if pattern.search(tweetText) is not None:
                valCount += 1
                valSum += score
        if valCount > 0:
            valAvg = valSum / valCount
        localSentimentList.append(valAvg)
    print("Number of Sentiment Tweets:", len(localSentimentList))
    # "normed" was removed from Axes.hist in Matplotlib 3.x; "density" is its replacement
    ax.hist(localSentimentList, range=(-5, 5), density=True, alpha=0.5, color=color, label=loc)
# The "b" keyword was removed in Matplotlib 3.5; pass visibility positionally
ax.grid(True, which='major')
ax.legend()
plt.show()
Number of Sentiment Tweets: 88 Number of Sentiment Tweets: 1799
Along with sentiment analysis, a question often asked of social networks is "What are people talking about?" We can answer this question using tools from topic modeling and natural language processing, and we can even divide this data to see what people in Ferguson are talking about versus those outside.
To generate these topic models, we will use the Gensim package's implementation of Latent Dirichlet Allocation (LDA), which basically constructs a set of topics where each topic is described as a probability distribution over the words in our tweets. Several other methods for topic modeling exist as well.
import gensim.models.ldamodel
import gensim.matutils
import sklearn.cluster
import sklearn.feature_extraction
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing
from nltk.corpus import stopwords
We first extract the text of all English tweets that are not retweets and make the text lowercase.
# Predicate for English-language tweets (used again for the regional LDA cells)
enFilter = lambda x: x["lang"] == "en"
# Get all tweets, filter out retweets, save only those in English, and convert to lowercase
allTweetList = reduce(lambda x, y: x + y, [frequencyMap[t]["list"] for t in sortedTimes])
# Use the "rt " prefix (with trailing space) for consistency with the sentiment
# cells above, so tweets whose first word merely begins with "rt" are kept.
noRetweetsList = list(filter(lambda x: not x["text"].lower().startswith("rt "), allTweetList))
onlyEnglishTweets = list(filter(enFilter, noRetweetsList))
lowerTweetText = [x["text"].lower() for x in onlyEnglishTweets]
print ("All Tweet Count:", len(allTweetList))
print ("Reduced Tweet Count:", len(lowerTweetText))
All Tweet Count: 293560 Reduced Tweet Count: 57121
Now we build a list of stop words (words we don't care about) and build a feature generator (the vectorizer) that assigns integer keys to tokens and counts the number of each token.
enStop = stopwords.words('english')
# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop + ["http", "https", "rt", "@", ":"]
# binary=True: we only track whether a token occurs in a tweet, not how often.
# The token pattern admits an optional leading "#" so hashtags survive tokenization.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode',
                                                             tokenizer=None,
                                                             token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b',
                                                             stop_words=stopList,
                                                             binary=True)
# Create a vectorizer for all our content
vectorizer.fit(lowerTweetText)
# Get all the words in our text
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favor of
# get_feature_names_out() — switch if running on a modern sklearn.
names = vectorizer.get_feature_names()
# Create a map from vectorizer IDs to words; reuse "names" instead of
# querying the vectorizer a second time.
id2WordDict = dict(enumerate(names))
We then use the vectorizer to transform our tweet text into a feature set, which essentially is a table with rows of tweets, columns for each keyword, and each cell is the number of times that keyword appears in that tweet.
We then convert that table into a model the Gensim package can handle, apply LDA, and grab the top 10 topics, 10 words that describe that topic, and print them.
# Create a Gensim corpus from the vectorized tweets and train an LDA model.
# (A single-core alternative is gensim.models.ldamodel.LdaModel with num_topics=10.)
corpus = vectorizer.transform(lowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=100, passes=2)
# Pull the top 10 topics with 10 descriptive words each and print them
ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_, token) in topic] for topic in ldaTopics]
for topicIndex, tokens in enumerate(topicTokens):
    print ("Topic:", topicIndex)
    for token in tokens:
        print ("\t", token)
Topic: 0 believe #ferguson can't ferguson hard images police terrifying co going Topic: 1 #ferguson investigate co step ferguson five line fair police #palestine Topic: 2 co bullets rubber police ferguson clash gas tear #ferguson protesters Topic: 3 #ferguson co act military police ferguson like shows become #missouri Topic: 4 disney ferguson channel #ferguson bc y'all playing shows people days Topic: 5 ferguson people #ferguson officials mad pres side videos obama distract Topic: 6 brown co ferguson live watch mike michael mo livestream shooting Topic: 7 ferguson #ferguson peace pray thank god people idea heart living Topic: 8 #ferguson police cameras co turn ones ferguson ridiculous shouldn't animals Topic: 9 rights #ferguson civil ferguson amendment different co tone 1st movement
We do the same thing with only those tweets in Ferguson to find topics people are discussing there.
# Repeat the topic modeling using only English tweets posted from Ferguson/St. Louis
inStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, inStLouisTweets)]
corpus = vectorizer.transform(inStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=10, passes=10)
ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_, token) in topic] for topic in ldaTopics]
for topicIndex, tokens in enumerate(topicTokens):
    print ("Topic:", topicIndex)
    for token in tokens:
        print ("\t", token)
Topic: 0 #ferguson co ferguson #mikebrown peaceful police stand arrested antoniofrench different Topic: 1 ferguson police like #ferguson situation actions co obama seems media Topic: 2 #ferguson ferguson #mikebrown outside much sen mccaskill got protests freedom Topic: 3 #ferguson police report co protest time wants conversation one media Topic: 4 man ferguson right chief antoniofrench news justice twitter conference wrapped Topic: 5 right cops around watching citizens still can't covering 20 miles Topic: 6 #ferguson co police ferguson simply i'm it's way tweets tear Topic: 7 ferguson #ferguson co mo shot i'm going chief mattdpearce amp Topic: 8 ferguson #ferguson co city outside people crowds 100 hall i'm Topic: 9 #ferguson co situation police stl says behind something like well
# And again for English tweets posted outside the St. Louis area
outStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, outStLouisTweets)]
corpus = vectorizer.transform(outStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=50, passes=10)
ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_, token) in topic] for topic in ldaTopics]
for topicIndex, tokens in enumerate(topicTokens):
    print ("Topic:", topicIndex)
    for token in tokens:
        print ("\t", token)
Topic: 0 #ferguson co ferguson amp unarmed shooting arrested live reporters police Topic: 1 #ferguson co ferguson don't team #mikebrown want go johnlegend never Topic: 2 #ferguson channel ferguson disney worse instead twitter went pretty world Topic: 3 ferguson #ferguson y'all hood story bring way black disney don't Topic: 4 #ferguson ferguson amp it's that's officer police i'm come state Topic: 5 #ferguson co ferguson happened going police #fergusonriot damn missouri time Topic: 6 #ferguson co police ferguson news ridiculous obama heart watching it's Topic: 7 ferguson #ferguson police shit know going people co cops even Topic: 8 #ferguson co real police ferguson crews leave area asked news Topic: 9 #ferguson police co chief moment ferguson #fergusonriot using one events
Issues of race, class, poverty, and police militarization all came out during the protests and clashes with law enforcement, and it didn't take much to find people on either side of each issue on Twitter. At the same time, people were turning to Twitter for news about the events on the ground since many perceived that mainstream media wasn't giving the events adequate or fair coverage. Using network analysis, we can get some idea about who the most important Twitter users were during this time, and how people split into groups online.
For this analysis, we'll use the NetworkX package to construct a social graph of how people interact. Each person in our Twitter data will be a node in our graph, and edges in the graph will represent mentions during this timeframe. Then we will explore a few simple analytical methods in network analysis, including:
To limit the amount of data we're looking at, we'll only build the network for people who have GPS locations in their tweets and the people they mention. We build this network simply by iterating through all the tweets in our GPS list and extract the "user_mentions" list from the "entities" section of the tweet object. For each mention a user makes, we will add an edge from that user to the user he/she mentioned.
In addition, we will append a location attribute to each user based on whether we saw them in Ferguson or outside of Ferguson.
import networkx as nx
# Build a directed mention graph: node = Twitter user, edge = "mentioned".
graph = nx.DiGraph()
# loc attribute: 1 = tweeted from inside St. Louis, 0 = outside, -1 = mentioned only
geoCodedMap = {1: inStLouisTweets, 0: outStLouisTweets}
for (location, locationList) in geoCodedMap.items():
    print (location, len(locationList))
    for tweet in locationList:
        userName = tweet["user"]["screen_name"]
        graph.add_node(userName, loc=location)
        # Add a directed edge for every user this tweet mentions
        mentionList = tweet["entities"]["user_mentions"]
        for otherUser in mentionList:
            otherUserName = otherUser["screen_name"]
            if not graph.has_node(otherUserName):
                # We never saw this user tweet, so their location is unknown
                graph.add_node(otherUserName, loc=-1)
            graph.add_edge(userName, otherUserName)
# Graph.node was removed in NetworkX 2.4; number_of_nodes() works in all versions
print ("Number of Users:", graph.number_of_nodes())
0 1900 1 100 Number of Users: 2345
In network analysis, "centrality" is used to measure the importance of a given node. Many different types of centrality are used to describe various types of importance though. Examples include "closeness centrality," which measures how close a node is to all other nodes in the network, versus "betweenness centrality," which measures how many shortest paths run through the given node. Nodes with high closeness centrality are important for rapidly disseminating information or spreading disease, whereas nodes with high betweenness are more important to ensure the network stays connected.
The PageRank is another algorithm for measuring importance and was proposed by Sergey Brin and Larry Page for the early version of Google's search algorithm. NetworkX has an implementation of the PageRank algorithm that we can use to look at the most important/authoritative users on Twitter based on their connections to other users.
# pagerank_numpy was deprecated and removed in NetworkX 3.0; the standard
# power-iteration implementation returns the same node->score dict.
pageRankList = nx.pagerank(graph)
# Rank users from most to least authoritative
highRankNodes = sorted(pageRankList, key=pageRankList.get, reverse=True)
for x in highRankNodes[:20]:
    # Look up each user's profile via the Twitter API client created earlier
    user = api.get_user(x)
    print (x, pageRankList[x], "\n\t", user.description, "\n----------")
BarackObama 0.008778991145676325 This account is run by Organizing for Action staff. Tweets from the President are signed -bo. ---------- WesleyLowery 0.0063034776506989 seek truth :: give voice to the voiceless :: shine light in the darkness ---------- AntonioFrench 0.004588161213234277 Dad, Husband, Alderman of the @21stWard in St. Louis, founder of @North_Campus, Auburn grad and devoted Auburn Football fan, @WUSTL MBA. ---------- CNN 0.0027568741839318553 It’s our job to #GoThere and tell the most difficult stories. Come with us! ---------- TheAnonMessage 0.002739265654803952 Coming soon. ---------- GovJayNixon 0.0027146137140248924 Official Twitter feed of Missouri Governor Jay Nixon ---------- ThoughtCatalog 0.0025173981877923404 All thinking is relevant. ---------- YourAnonNews 0.0024680943062341918 Signal boost for Anonymous operations, resistance movements, & journalism. #ShutItDown ---------- JavoPerez_ 0.0023201826615596288 Employed | IE | Tread Lightly | IG: Javoperez_ ---------- JohnnyLoud_Pack 0.002320182661559628 Living this #SagLife as a #Dreadhead... I Aint Shit Yet!!! Cleveland 216 #footlocker #towercity ---------- DLRMiller 0.0023201826615596253 I like drawing, painting, and chocolate. ---------- washingtonpost 0.0022462268392225634 Tweets about everything from breaking news to bad restaurants. Sometimes global, sometimes local. Led by @hermanywong and @MiGold. ---------- maddow 0.00219692295766441 I see political people... (Retweets do not imply endorsement.) ---------- FoxNews 0.0021476190761062636 America’s Strongest Primetime Lineup Anywhere! Follow America's #1 cable news network, delivering you breaking news, insightful analysis, and must-see videos. ---------- msnbc 0.001920949282968822 The place for in-depth analysis, political commentary and informed perspectives. Have questions? Tweet to @Farrashley, @NishaChittal and @digimuller. ---------- chrislhayes 0.0016075174644920428 Host of All In with Chris Hayes on MSNBC, Weeknights at 8pm. 
Editor at Large at The Nation. Cubs fan. ---------- ryanjreilly 0.0015806244381876187 @HuffingtonPost Justice Reporter on #DOJ, #SCOTUS, #Guantanamo, #Ferguson et al. Previously: @TPM, @MainJustice 202-527-9261 ryan.reilly@huffingtonpost.com ---------- ksdknews 0.0014425735698248158 Where the News Comes First for breaking news, weather and sports both on-air and online. Retweets are not endorsements. #STLTogether ---------- jonswaine 0.001408060852734104 Senior reporter for @GuardianUS jon.swaine@theguardian.com ---------- natedrug 0.0013834089119550623 dead inside. ----------
A lot of information can be gleaned from visualizing how these networks interact. In Python, we can plot these networks relatively easily.
# Quick-and-dirty visualization: color nodes by location, scatter them at random
print (len(graph.nodes(data=True)))
colors = [0.9 if attrs["loc"] == 1 else 0.1 for (_, attrs) in graph.nodes(data=True)]
pos = {node: (np.random.rand(2) * 10) for node in graph.nodes()}
nx.draw_networkx_nodes(graph, pos, node_color=colors)
nx.draw_networkx_edges(graph, pos)
2345
<matplotlib.collections.LineCollection at 0x2e2c9bb38>
This graph is relatively uninformative, so we will turn to other tools for better visualization.
We first save this graph to a file, so we can import into other tools.
# Export the mention graph to GraphML so it can be loaded into external tools (e.g. Gephi)
nx.write_graphml(graph, "inVsOutNetwork.graphml", encoding='utf-8', prettyprint=False)
# If you want to play with the full graph,
# here is code that will build it up for you.
# Be careful. It's large.
fullGraph = nx.DiGraph()
# Screen names we observed tweeting inside / outside St. Louis
inStlUsers = set(map(lambda x: x["user"]["screen_name"], inStLouisTweets))
outStlUsers = set(map(lambda x: x["user"]["screen_name"], outStLouisTweets))
for (userName, tweetList) in globalUserMap.items():
    # loc attribute: 1 = inside St. Louis, 0 = outside, -1 = no GPS evidence
    location = -1
    if userName in inStlUsers:
        location = 1
    elif userName in outStlUsers:
        location = 0
    fullGraph.add_node(userName, loc=location)
    for tweet in tweetList:
        # Add a directed edge for every user this tweet mentions
        mentionList = tweet["entities"]["user_mentions"]
        for otherUser in mentionList:
            otherUserName = otherUser["screen_name"]
            if not fullGraph.has_node(otherUserName):
                fullGraph.add_node(otherUserName, loc=-1)
            fullGraph.add_edge(userName, otherUserName)
# Graph.node was removed in NetworkX 2.4; number_of_nodes() works in all versions
print ("Number of Users:", fullGraph.number_of_nodes())
nx.write_graphml(fullGraph, "fullNetwork.graphml", encoding='utf-8', prettyprint=False)