#!/usr/bin/env python
# coding: utf-8
# # Analyzing Links using SmappDragon
# by [Leon Yin](twitter.com/leonyin)
# 2018-02-16
#
# This Tutorial shows how to
# 1. Download tweets from Twitter using Tweepy,
# 2. Filter and parse tweets using SmappDragon,
# 3. Create a link metadata table using SmappDragon, and
# 4. Analyze links from questionable websites using Pandas and the OpenSources.co dataset.
#
# View this on [Github](https://github.com/yinleon/smappdragon-tutorials/blob/master/smappdragon-tutorial-link-analysis.ipynb).
# View this on [NBViewer](https://nbviewer.jupyter.org/github/yinleon/smappdragon-tutorials/blob/master/smappdragon-tutorial-link-analysis.ipynb).
# Visit my Lab's [website](https://wp.nyu.edu/smapp/)
# ## Downloading Tweets with Tweepy
# In[1]:
# !pip install -r requirements.txt
# In[2]:
import os
import json
import tweepy
from smappdragon import JsonCollection
# In[3]:
# fill these in with your Twitter API credentials; I store mine as environment variables.
consumer_key = os.environ.get('TWEEPY_API_KEY')
consumer_secret = os.environ.get('TWEEPY_API_SECRET')
access_key = os.environ.get('TWEEPY_ACCESS_TOKEN')
access_secret = os.environ.get('TWEEPY_TOKEN_SECRET')
# In[4]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, retry_count=2, retry_delay=5,
wait_on_rate_limit=True,
wait_on_rate_limit_notify=True)
# In[5]:
screen_name = 'seanhannity'
# We can use the tweepy `Cursor` to page through the Twitter API and collect up to a user's ~3,200 most recent tweets.
# In[6]:
user_tweets = []
for tweet in tweepy.Cursor(api.user_timeline, screen_name=screen_name).items():
user_tweets.append(tweet._json)
len(user_tweets)
# Let's store this data in a new directory.
# In[ ]:
get_ipython().system('mkdir -p ./data')
# In[ ]:
tweet_file = './data/tweets.json'
# In[7]:
with open(tweet_file, 'w') as f:
for tweet in user_tweets:
f.write(json.dumps(tweet) + '\n')
# We could work with this JSON in a variety of ways.
# At my lab we created a module that works with JSON records in a `collection` object.
# In[8]:
collect = JsonCollection(tweet_file, throw_error=0, verbose=1)
# In[9]:
collect
# We access the tweets stored in `collect` the same way as with any generator.
# In[10]:
collect.get_iterator()
# ## What is a generator?
# A generator is an iterator that only keeps track of its current position.
# In other words, the entirety of the tweet json is never held in memory at once.
# They are created by functions that _yield_ objects, rather than _return_ objects.
# In[11]:
def simple_generator_function():
yield 1
yield 2
yield 3
# In[12]:
gen = simple_generator_function()
# In[13]:
gen
# We see that this is similar to what is returned from `collect.get_iterator()`.
# We access the values in a generator by iterating through it.
# For loops are the easiest way to iterate.
# In[14]:
for i in gen:
print(i)
# Notice that once a generator has been iterated through, it is exhausted and yields nothing further.
# In[15]:
for i in gen:
print(i)
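# Calling `next()` on the exhausted generator makes this explicit: it raises `StopIteration`.
# A minimal sketch:
# In[ ]:
try:
    next(gen)
except StopIteration:
    print('gen is exhausted')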
# The `get_iterator` function returns the collection as a generator.
# Unlike a plain generator, which is exhausted after one pass, each call to `get_iterator` hands back a fresh generator, so we can keep iterating over the collection.
# In[16]:
for tweet in collect.get_iterator():
print(json.dumps(tweet, indent=2))
break
# We're breaking only because we don't want to print all the tweets in our json file.
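# To illustrate that each call to `get_iterator()` returns a fresh generator, here is a minimal sketch that makes two full passes over the collection; the two counts should match.
# In[ ]:
first_pass = sum(1 for _ in collect.get_iterator())
second_pass = sum(1 for _ in collect.get_iterator())
print(first_pass, second_pass)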
# ## Crunching Numbers
# We can study the structure of each tweet and crunch some numbers.
# For this example, let's count which users are mentioned most often.
# In[18]:
from collections import Counter
counter = Counter()
for tweet in collect.get_iterator():
for user in tweet['entities']['user_mentions']:
counter.update([user['screen_name']])
counter.most_common(10)
# We can also create conditional filters for the data.
# In[19]:
def exclude_retweets(tweet):
    '''
    An example of a custom filter for a smappdragon collection.
    The input is always a json record (dict); return True to keep
    the tweet and False to filter it out.
    '''
    # retweets carry a 'retweeted_status' payload; the top-level 'retweeted'
    # flag only marks whether the authenticating user retweeted the tweet
    if 'retweeted_status' in tweet:
        return False
    return True
# In[20]:
collect.set_custom_filter(exclude_retweets)
# In[21]:
filtered_tweets = []
for tweet in collect.get_iterator():
filtered_tweets.append(tweet)
len(filtered_tweets)
# We can dump the filtered collection to a compressed csv.
# In[22]:
filtered_tweet_file = 'tweets_filtered.csv.gz'
collect.dump_to_csv(filtered_tweet_file,
input_fields = ['user.id', 'text', 'created_at'],
compression = 'gzip')
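# As a quick sanity check (not part of the original workflow), the compressed csv can be read back with pandas, which infers gzip compression from the `.gz` extension.
# In[ ]:
import pandas as pd
pd.read_csv(filtered_tweet_file).head()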
# What are the columns available for the `input_fields` argument?
# In[23]:
def get_all_columns(d, key=[]):
    '''
    A recursive function that traverses json keys and prints
    the dot-delimited path to each leaf value.
    '''
    if not isinstance(d, dict):
        print('.'.join(key))
        return
    for k, v in d.items():
        key_path = key + [k]
        get_all_columns(v, key_path)
# In[24]:
get_all_columns(tweet)
# ## Link Analysis
# Let's parse all the links out of each tweet.
# We can't just return a single value, since there can be multiple links per tweet.
# We solve this by writing a generator, and flattening the results with `itertools`, as sketched below.
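# For intuition, `itertools.chain.from_iterable` flattens an iterable of iterables into a single stream; that is how the per-tweet generators get combined into one table below. A minimal sketch:
# In[ ]:
import itertools
list(itertools.chain.from_iterable([[1, 2], [], [3]]))  # -> [1, 2, 3]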
# In[25]:
import itertools
import requests
from urllib.parse import urlparse
import pandas as pd
# In[26]:
def get_link(tweet):
'''
    Yields a row of tweet and link metadata for each url in the tweet's entities.
'''
if not isinstance(tweet, dict):
return
row = {
'user.id': tweet['user']['id'],
'tweet.id': tweet['id'],
'tweet.created_at': tweet['created_at'],
'tweet.text' : tweet['text']
}
list_urls = tweet['entities']['urls']
if list_urls:
for url in list_urls:
r = row.copy()
r['link.url_long'] = url.get('expanded_url')
if r['link.url_long']:
                # strip a leading 'www.' prefix (str.lstrip removes characters, not prefixes)
                domain = urlparse(r['link.url_long']).netloc.lower()
                r['link.domain'] = domain[4:] if domain.startswith('www.') else domain
r['link.url_short'] = url.get('url')
yield r
# In[27]:
df_links = pd.DataFrame(
list(
itertools.chain.from_iterable(
[ get_link(tweet) for tweet in collect.get_iterator() if tweet ]
)
)
)
# In[28]:
df_links.head()
# In[29]:
# filter out Twitter links
df_links = df_links[df_links['link.domain'] != 'twitter.com']
# We can also expand shortened links from bit.ly.
# In[31]:
def resolve_shortened_link(link):
    '''
    Handles link shorteners like bit.ly by following redirects
    and returning the domain of the resolved url.
    '''
    if link['link.domain'] in ['bit.ly']:
        r = requests.head(link['link.url_long'], allow_redirects=True)
        # keep the column consistent: store the resolved domain, not the full url
        domain = urlparse(r.url).netloc.lower()
        return domain[4:] if domain.startswith('www.') else domain
    else:
        return link['link.domain']
# We use the `apply()` function on a Pandas dataframe to apply a function to entire rows (`axis=1`) or columns (`axis=0`).
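# A tiny illustration on a hypothetical toy dataframe, just to show the `axis` argument:
# In[ ]:
toy = pd.DataFrame({'a': [1, 2], 'b': [10, 20]})
print(toy.apply(sum, axis=1))  # row-wise sums: 11, 22
print(toy.apply(sum, axis=0))  # column-wise sums: a=3, b=30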
# In[33]:
df_links.loc[:, 'link.domain'] = df_links.apply(resolve_shortened_link, axis=1)
# In[34]:
df_links['link.domain'].value_counts().head(15)
# We can see the most common words associated with each domain using a simple word count, excluding stop words.
# In[35]:
from nltk.corpus import stopwords
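# if the stopword corpus has not been downloaded yet, you may first need to run:
# nltk.download('stopwords')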
# What does Hannity's own site, hannity.com, focus on?
# In[36]:
word_count = Counter()
english_stopwords = set(stopwords.words('english'))  # nltk corpus names are lowercase
for sent in df_links[df_links['link.domain'] == 'hannity.com']['tweet.text'].values:
    word_count.update([w for w in sent.split() if w not in english_stopwords])
word_count.most_common(10)
# What about Amazon?
# In[37]:
word_count = Counter()
for sent in df_links[df_links['link.domain'] == 'amzn.to']['tweet.text']:
    word_count.update([w for w in sent.split() if w not in english_stopwords])
word_count.most_common(10)
# ## Questionable Media Domains
# We can use the OpenSources.co dataset to filter domains on various criteria.
# Here is a notebook that makes the data machine-readable.
# In[38]:
opensources_clean_url = 'https://raw.githubusercontent.com/yinleon/fake_news/master/data/sources_clean.tsv'
df_os = pd.read_csv(opensources_clean_url, sep='\t')
# In[39]:
df_os.head()
# In[40]:
df_questionable = pd.merge(left=df_links, left_on='link.domain',
                           right=df_os, right_on='domain', how='inner')
# What is the breakdown of links shared from questionable sites?
# In[41]:
df_questionable['link.domain'].value_counts()
# We can sum the media-class columns to see the breakdown of questionable links.
# In[42]:
# these are the columns we'll base our calculations on.
media_classes = [c for c in df_os.columns if c not in ['domain', 'notes']]
media_classes
# In[43]:
get_ipython().run_line_magic('matplotlib', 'inline')
# In[44]:
breakdown = df_questionable[media_classes].sum(axis=0)
breakdown
# In[45]:
# we'll filter out the classes with zero links, sort the rest, and plot the result!
breakdown[breakdown != 0].sort_values().plot(
kind='bar', title='Sean Hannity Number of Links per Topic'
)