#!/usr/bin/env python
# coding: utf-8

# # Analyzing Links using SmappDragon
# by [Leon Yin](twitter.com/leonyin)
# 2018-02-16
#
# This tutorial shows how to
# 1. Download tweets from Twitter using Tweepy,
# 2. Filter and parse tweets using SmappDragon,
# 3. Create a link metadata table using SmappDragon, and
# 4. Analyze links from questionable websites using Pandas and the OpenSources.co dataset.
#
# View this on [Github](https://github.com/yinleon/smappdragon-tutorials/blob/master/smappdragon-tutorial-link-analysis.ipynb).
# View this on [NBViewer](https://nbviewer.jupyter.org/github/yinleon/smappdragon-tutorials/blob/master/smappdragon-tutorial-link-analysis.ipynb).
# Visit my Lab's [website](https://wp.nyu.edu/smapp/)

# ## Downloading Tweets with Tweepy

# In[1]:


# !pip install -r requirements.txt


# In[2]:


import os
import json
import tweepy
from smappdragon import JsonCollection


# In[3]:


# Fill these in with your Twitter API credentials; I store them as environment variables.
consumer_key = os.environ.get('TWEEPY_API_KEY')
consumer_secret = os.environ.get('TWEEPY_API_SECRET')
access_key = os.environ.get('TWEEPY_ACCESS_TOKEN')
access_secret = os.environ.get('TWEEPY_TOKEN_SECRET')


# In[4]:


auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth,
                 retry_count=2,
                 retry_delay=5,
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)


# In[5]:


screen_name = 'seanhannity'


# We can use the tweepy `Cursor` to hit the Twitter API for up to 3.2K tweets per user.

# In[6]:


user_tweets = []
for tweet in tweepy.Cursor(api.user_timeline, screen_name=screen_name).items():
    user_tweets.append(tweet._json)
len(user_tweets)


# Let's store this data in a new directory.

# In[ ]:


get_ipython().system('mkdir ./data')


# In[ ]:


tweet_file = './data/tweets.json'


# In[7]:


with open(tweet_file, 'w') as f:
    for tweet in user_tweets:
        f.write(json.dumps(tweet) + '\n')


# We could work with this JSON in a variety of ways.
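# For example, because the file is newline-delimited JSON, a minimal sketch using only the standard library could read it back like this. This cell is purely illustrative and assumes the `tweet_file` written above.

# In[ ]:


# illustrative: read the newline-delimited json back with the standard library
tweets = []
with open(tweet_file) as f:
    for line in f:
        tweets.append(json.loads(line))
len(tweets)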
# At my lab we created a module which works with JSON records in a `collection` object.

# In[8]:


collect = JsonCollection(tweet_file, throw_error=0, verbose=1)


# In[9]:


collect


# We access the tweets stored in `collect` the same way we would for any generator.

# In[10]:


collect.get_iterator()


# ## What is a generator?
# A generator is an iterator that only keeps track of its current position.
# In other words, the entirety of the tweet json is not held in memory.
# Generators are created by functions that _yield_ objects, rather than _return_ objects.

# In[11]:


def simple_generator_function():
    yield 1
    yield 2
    yield 3


# In[12]:


gen = simple_generator_function()


# In[13]:


gen


# We see that this is similar to what is returned from `collect.get_iterator()`.
# We access the values in a generator by iterating through it.
# For loops are the easiest way to iterate.

# In[14]:


for i in gen:
    print(i)


# Notice that once a generator has been iterated through, it is exhausted and can no longer be used.

# In[15]:


for i in gen:
    print(i)


# When we use the `get_iterator` function, we get the collection back as a generator.
# Unlike a conventional generator, each call to this function returns a fresh generator, so we can continue to iterate through the collection.

# In[16]:


for tweet in collect.get_iterator():
    print(json.dumps(tweet, indent=2))
    break


# We're breaking out of the loop only because we don't want to print all the tweets in our json file.
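# As a quick sanity check (this cell is just illustrative), we can call `get_iterator()` twice and count the tweets on each pass; an ordinary generator would come back empty the second time.

# In[ ]:


# illustrative check: get_iterator() can be called more than once
first_pass = sum(1 for _ in collect.get_iterator())
second_pass = sum(1 for _ in collect.get_iterator())
first_pass, second_pass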
# ## Crunching Numbers
# We can study the structure of each tweet, and crunch some numbers.
# For this example, let's count whom the user mentions most often.

# In[18]:


from collections import Counter

counter = Counter()
for tweet in collect.get_iterator():
    for user in tweet['entities']['user_mentions']:
        counter.update([user['screen_name']])
counter.most_common(10)


# We can also create conditional statements to filter the data.

# In[19]:


def exclude_retweets(tweet):
    '''
    An example of a filter for a smapp collection.
    The input is always a json record; return True to keep the tweet
    and False to drop it.
    '''
    # retweets carry a `retweeted_status` object; the `retweeted` field only
    # indicates whether the *authenticating* user retweeted the tweet.
    if 'retweeted_status' in tweet:
        return False
    return True


# In[20]:


collect.set_custom_filter(exclude_retweets)


# In[21]:


filtered_tweets = []
for tweet in collect.get_iterator():
    filtered_tweets.append(tweet)
len(filtered_tweets)


# We can dump the filtered collection to a compressed csv.

# In[22]:


filtered_tweet_file = 'tweets_filtered.csv.gz'
collect.dump_to_csv(filtered_tweet_file,
                    input_fields=['user.id', 'text', 'created_at'],
                    compression='gzip')


# What are the columns available for the `input_fields` argument?

# In[23]:


def get_all_columns(d, key=[]):
    '''
    A recursive function that traverses nested json keys
    and prints each dotted key path.
    '''
    if not isinstance(d, dict):
        print('.'.join(key))
        return
    for k, v in d.items():
        key_path = key + [k]
        get_all_columns(d[k], key_path)


# In[24]:


get_all_columns(tweet)
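# To see how one of those dotted column names (e.g. `'user.screen_name'`) maps back into the nested json, here is a small helper. It is illustrative only and not part of smappdragon; it simply walks a dotted key path by hand.

# In[ ]:


def get_dotted_field(record, path):
    '''
    Illustrative helper: walk a nested dict using a dotted key path
    such as 'user.screen_name'. Returns None if any key is missing.
    '''
    for key in path.split('.'):
        if not isinstance(record, dict):
            return None
        record = record.get(key)
    return record

get_dotted_field(tweet, 'user.screen_name')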
# ## Link Analysis
# Let's parse all of the links out of the tweets.
# We can't just return a single value, as there can be multiple links per tweet.
# We can solve this by using a generator, and unpacking each record using `itertools`.

# In[25]:


import itertools
import requests
from urllib.parse import urlparse
import pandas as pd


# In[26]:


def get_link(tweet):
    '''
    Yields one row of tweet metadata per link contained in a tweet.
    '''
    if not isinstance(tweet, dict):
        return
    row = {
        'user.id': tweet['user']['id'],
        'tweet.id': tweet['id'],
        'tweet.created_at': tweet['created_at'],
        'tweet.text': tweet['text']
    }
    list_urls = tweet['entities']['urls']
    if list_urls:
        for url in list_urls:
            r = row.copy()
            r['link.url_long'] = url.get('expanded_url')
            if r['link.url_long']:
                domain = urlparse(r['link.url_long']).netloc.lower()
                # remove a leading 'www.' prefix
                # (note: str.lstrip('www.') strips characters, not a prefix)
                if domain.startswith('www.'):
                    domain = domain[len('www.'):]
                r['link.domain'] = domain
            r['link.url_short'] = url.get('url')
            yield r


# In[27]:


df_links = pd.DataFrame(
    list(
        itertools.chain.from_iterable(
            [get_link(tweet) for tweet in collect.get_iterator() if tweet]
        )
    )
)


# In[28]:


df_links.head()


# In[29]:


# filter out Twitter links
df_links = df_links[df_links['link.domain'] != 'twitter.com']


# We can also expand shortened links from bit.ly.

# In[31]:


def resolve_shortened_link(link):
    '''
    Handles link shorteners like bit.ly by following redirects
    and returning the resolved domain.
    '''
    if link['link.domain'] in ['bit.ly']:
        r = requests.head(link['link.url_long'], allow_redirects=True)
        return urlparse(r.url).netloc.lower()
    else:
        return link['link.domain']


# We use the `apply()` function on a Pandas dataframe to apply a function to entire rows (`axis=1`) or entire columns (`axis=0`).

# In[33]:


df_links.loc[:, 'link.domain'] = df_links.apply(resolve_shortened_link, axis=1)


# In[34]:


df_links['link.domain'].value_counts().head(15)


# We can see the most common words associated with each link domain using a simple word count, with stop words removed.

# In[35]:


from nltk.corpus import stopwords

# you may need to run nltk.download('stopwords') once
stop_words = set(stopwords.words('english'))


# What does Hannity's own site focus on?

# In[36]:


word_count = Counter()
for sent in df_links[df_links['link.domain'] == 'hannity.com']['tweet.text'].values:
    word_count.update([w for w in sent.split() if w not in stop_words])
word_count.most_common(10)


# What about Amazon?

# In[37]:


word_count = Counter()
for sent in df_links[df_links['link.domain'] == 'amzn.to']['tweet.text']:
    word_count.update([w for w in sent.split() if w not in stop_words])
word_count.most_common(10)
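# Since that word-count block is identical for each domain, a small helper along these lines could wrap it. This is illustrative only, reusing the `df_links`, `Counter`, and `stop_words` objects defined above.

# In[ ]:


def domain_word_count(domain, n=10):
    '''
    Illustrative helper: count the most common non-stop words
    in tweets linking to a given domain.
    '''
    word_count = Counter()
    for sent in df_links[df_links['link.domain'] == domain]['tweet.text']:
        word_count.update([w for w in sent.split() if w not in stop_words])
    return word_count.most_common(n)

domain_word_count('hannity.com')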
# ## Questionable Media Domains
# We can use the open sources dataset to filter domains on various criteria.
# Here is a notebook that makes the data machine-readable.

# In[38]:


opensources_clean_url = 'https://raw.githubusercontent.com/yinleon/fake_news/master/data/sources_clean.tsv'
df_os = pd.read_csv(opensources_clean_url, sep='\t')


# In[39]:


df_os.head()


# In[40]:


df_questionable = pd.merge(left=df_links,
                           left_on='link.domain',
                           right=df_os,
                           right_on='domain',
                           how='inner')


# What is the breakdown of links shared from questionable sites?

# In[41]:


df_questionable['link.domain'].value_counts()


# We can do some simple matrix math (summing the label columns) to see the breakdown of questionable links.

# In[42]:


# these are the columns we'll base our calculations on.
media_classes = [c for c in df_os.columns if c not in ['domain', 'notes']]
media_classes


# In[43]:


get_ipython().run_line_magic('matplotlib', 'inline')


# In[44]:


breakdown = df_questionable[media_classes].sum(axis=0)
breakdown


# In[45]:


# we'll filter out the non-represented classes, sort them, and plot it!
breakdown[breakdown != 0].sort_values().plot(
    kind='bar',
    title='Sean Hannity Number of Links per Topic'
)
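# The same counts can also be read as shares of the matched links. This last cell is an illustrative extra; note that a domain can carry more than one label, so the shares will not sum to 1.

# In[ ]:


# illustrative: share of matched questionable links per label (labels can overlap)
(breakdown[breakdown != 0] / len(df_questionable)).sort_values()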