This notebook shows how to use the Twitter API for collecting data from Twitter. We show how to use the Tweepy library to collect data, how to restrict search results, and how to perform some basic visualisation and filtering of the collected data. This notebook is structured as follows:
# Load .env file if it exists:
%load_ext dotenv
%dotenv api.env
# Import required libraries:
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import requests
import folium
import os
import tweepy
import uuid
import json
import pandas as pd
import numpy as np
from geopandas import GeoDataFrame
from shapely.geometry import Point
Twitter and other social media platforms use a form of authentication called 'OAuth'. There are different types of authentication. In order to collect data, you need a Twitter account and must generate four different access tokens (see the API key generation tutorial):
# Add your personal tokens below:
CONSUMER_KEY = '<YOUR_CONSUMER_KEY>'
CONSUMER_SECRET = '<YOUR_CONSUMER_SECRET>'
ACCESS_TOKEN = '<YOUR_ACCESS_TOKEN>'
TOKEN_SECRET = '<YOUR_TOKEN_SECRET>'
# Load environment variables instead if they exist:
CONSUMER_KEY = os.environ.get('CONSUMER_KEY', CONSUMER_KEY)
CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET', CONSUMER_SECRET)
ACCESS_TOKEN = os.environ.get('ACCESS_TOKEN', ACCESS_TOKEN)
TOKEN_SECRET = os.environ.get('TOKEN_SECRET', TOKEN_SECRET)
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, TOKEN_SECRET)
api = tweepy.API(auth)
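Before collecting anything, it can be worth checking that the credentials are valid. A minimal sketch using Tweepy's verify_credentials (depending on the Tweepy version, it returns False or raises an error on invalid tokens):
# Sanity-check the credentials by fetching the authenticated user:
me = api.verify_credentials()
if me:
    print('Authenticated as @' + me.screen_name)
else:
    print('Authentication failed: check your tokens.')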
First we obtain some data using the basic Twitter search API. Tweepy manages rate limits and other API limitations. In order to find relevant tweets, we can use query parameters such as keywords or geolocation.
# Set some simple earthquake related keywords:
keywords = ['earthquake', 'quake', 'magnitude', 'epicenter', 'aftershock']
# Collect 100 tweets using the keywords:
search_results = api.search(q=' OR '.join(keywords), count=100)
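A single search request returns at most 100 tweets; to go beyond that, Tweepy's Cursor can paginate over successive requests, and the wait_on_rate_limit option makes the client sleep until the rate-limit window resets. A sketch, assuming Tweepy 3.x (the 300-tweet target is arbitrary):
# Paginate beyond 100 results per request; sleep automatically on rate limits:
api_paged = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
more_results = [status for status in
                tweepy.Cursor(api_paged.search, q=' OR '.join(keywords), count=100).items(300)]
print('Collected ' + str(len(more_results)) + ' tweets')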
Each Twitter message is encoded as JSON and contains much more information than just its author and text:
if search_results:
    # Print the first Tweet as JSON:
    print(json.dumps(search_results[0]._json, indent=2))
{ "geo": null, "favorite_count": 0, "id_str": "988021502216843264", "metadata": { "result_type": "recent", "iso_language_code": "ja" }, "id": 988021502216843264, "entities": { "symbols": [], "hashtags": [], "user_mentions": [ { "screen_name": "ngcmk005", "name": "\u30d5\u30b8\u30e4\u30de \u3066\u308b\u516c", "indices": [ 3, 12 ], "id_str": "571598246", "id": 571598246 } ], "urls": [] }, "retweeted_status": { "geo": null, "entities": { "symbols": [], "hashtags": [], "user_mentions": [], "urls": [ { "indices": [ 117, 140 ], "expanded_url": "https://twitter.com/i/web/status/987898117780525056", "url": "https://t.co/pzMCOqiyAf", "display_url": "twitter.com/i/web/status/9\u2026" } ] }, "id_str": "987898117780525056", "metadata": { "result_type": "recent", "iso_language_code": "ja" }, "id": 987898117780525056, "favorite_count": 1, "truncated": true, "text": "\u3010\u5730\u9707\u72b6\u6cc1\u3011 4\u670822\u65e5 \u6c17\u8c61\u5e81\u767a\u8868\u306e\u5730\u9707\u306f\u300112\u6642\u307e\u30672\u56de\u3067\u3059\u3002\u203b22\u65e5 \u5348\u524d\u4e2d\u307e\u3067\u306e\u6709\u611f\u5730\u9707\u306f\u5c11\u306a\u304f\u306a\u3063\u3066\u3044\u308b\u3002\u5c0f\u7b20\u539f\u8af8\u5cf6\u306e\u7236\u5cf6\u8fd1\u6d77\u3067M4.9\u3001\u9707\u5ea62\u3068\u9752\u68ee\u770c\u6771\u65b9\u6c96\u3067M3.1\u3001\u9707\u5ea61\u3000\u6628\u65e5\u304b\u3089\u5730\u9707\u304c\u5c11\u306a\u3044\u4e2d\u3001M5.0\u3084M4.9\u2026 https://t.co/pzMCOqiyAf", "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", "user": { "entities": { "description": { "urls": [] } }, "friends_count": 2, "id_str": "571598246", "url": null, "default_profile": true, "is_translator": false, "id": 571598246, "description": "\u3053\u3061\u3089\u306f\u3001\u30d5\u30b8\u30e4\u30de \u3066\u308b\u516c(\u30df\u30b9\u30bf\u30fc\u30fbT)\u3067\u3059\u3002 \u5c1a\u3001\u500b\u4eba\u653e\u9001\u5c40\u306eminiFMT&T879\u3092\u958b\u5c40\u3057\u3066\u3044\u307e\u3059\u3002\u30a4\u30f3\u30bf\u30fc\u30cd\u30c3\u30c8\u3092\u4e2d\u5fc3\u306b\u30e9\u30a4\u30d6\u653e\u9001\u3092\u914d\u4fe1\u3057\u3066\u3044\u308b\u3002\u66f8\u304d\u8fbc\u307f\u306f\u5730\u9707\u95a2\u4fc2\u3084\u6c17\u8c61\u95a2\u4fc2\u306e\u60c5\u5831\u3092\u4e3b\u306b\u4f1d\u3048\u3066\u3044\u307e\u3059\u306e\u3067\u3001\u5b9c\u3057\u304f\u305a\u3089!", "profile_text_color": "333333", "notifications": false, "profile_background_tile": false, "profile_sidebar_fill_color": "DDEEF6", "following": false, "geo_enabled": false, "profile_use_background_image": true, "favourites_count": 20, "profile_image_url": "http://pbs.twimg.com/profile_images/478765435089727488/dWLHaRZ-_normal.jpeg", "is_translation_enabled": false, "follow_request_sent": false, "profile_link_color": "1DA1F2", "contributors_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/571598246/1413476307", "name": "\u30d5\u30b8\u30e4\u30de \u3066\u308b\u516c", "profile_sidebar_border_color": "C0DEED", "time_zone": null, "protected": false, "statuses_count": 16391, "profile_background_color": "C0DEED", "profile_image_url_https": "https://pbs.twimg.com/profile_images/478765435089727488/dWLHaRZ-_normal.jpeg", "verified": false, "listed_count": 11, "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "default_profile_image": false, "lang": "ja", "translator_type": "none", "has_extended_profile": false, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "screen_name": "ngcmk005", "location": "\u9759\u5ca1\u770c", 
"followers_count": 166, "utc_offset": null, "created_at": "Sat May 05 09:19:02 +0000 2012" }, "place": null, "in_reply_to_screen_name": null, "possibly_sensitive": false, "coordinates": null, "contributors": null, "is_quote_status": false, "retweeted": false, "favorited": false, "in_reply_to_status_id": null, "lang": "ja", "in_reply_to_user_id": null, "in_reply_to_status_id_str": null, "retweet_count": 1, "created_at": "Sun Apr 22 03:37:20 +0000 2018", "in_reply_to_user_id_str": null }, "truncated": false, "text": "RT @ngcmk005: \u3010\u5730\u9707\u72b6\u6cc1\u3011 4\u670822\u65e5 \u6c17\u8c61\u5e81\u767a\u8868\u306e\u5730\u9707\u306f\u300112\u6642\u307e\u30672\u56de\u3067\u3059\u3002\u203b22\u65e5 \u5348\u524d\u4e2d\u307e\u3067\u306e\u6709\u611f\u5730\u9707\u306f\u5c11\u306a\u304f\u306a\u3063\u3066\u3044\u308b\u3002\u5c0f\u7b20\u539f\u8af8\u5cf6\u306e\u7236\u5cf6\u8fd1\u6d77\u3067M4.9\u3001\u9707\u5ea62\u3068\u9752\u68ee\u770c\u6771\u65b9\u6c96\u3067M3.1\u3001\u9707\u5ea61\u3000\u6628\u65e5\u304b\u3089\u5730\u9707\u304c\u5c11\u306a\u3044\u4e2d\u3001M5.0\u3084M4.9\u306e\u5730\u9707\u304c\u767a\u4f5c\u7684\u306b\u8d77\u304d\u2026", "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>", "user": { "entities": { "description": { "urls": [] } }, "friends_count": 3838, "id_str": "297856618", "url": null, "default_profile": true, "is_translator": false, "id": 297856618, "description": "\u25c6\u76f8\u4e92\u30d5\u30a9\u30ed\u30fc\u975e\u5bfe\u5fdc\u25c6\u30d5\u30a9\u30ed\u30fc\u306fTL\u9061\u3063\u3066\u614e\u91cd\u306b\uff08RT\u975e\u8868\u793a\u63a8\u5968\u3001\u30ea\u30e0\u30fc\u30d6\u30fb\u30df\u30e5\u30fc\u30c8\u306f\u3054\u81ea\u7531\u306b\u3001\u30d6\u30ed\u30c3\u30af\u306b\u53f7\u6ce3\u25c6\u8b70\u8ad6\u304a\u65ad\u308a\u3000\u2461\u6b86\u3069\u306e\u545f\u304d\uff06RT\u306f\uff08\u4e2d\u306e\u4eba\uff09\uff1d\u306d\u3053\u30fb\u76f8\u68d2\u30fb\u76f8\u64b2\u30fb\u30b5\u30c3\u30ab\u30fc\u30fb\u7f8e\u8853\u7b49\u3002\u504f\u982d\u75db\u3002\u611a\u75f4\u30fb\u5b9f\u6cc1\u30fb\u60c5\u5831RT\u3000\u2460\u307e\u3061\u3083\u304a\u3058\u672c\u4eba\uff1d\u672c\u6765\u306e\u30e6\u30fc\u30b6\u3002\u30ea\u30a2\u30eb\u77e5\u4eba\u3068\u4ea4\u6d41\u3002\u52d5\u7269\u306b\u597d\u304b\u308c\u308b\u5317\u6d77\u9053\u7523\u5bd2\u304c\u308a\u3002\u30a2\u30a4\u30b3\u30f3\u306f\u53cb\u306e\u72ac", "profile_text_color": "333333", "notifications": false, "profile_background_tile": false, "profile_sidebar_fill_color": "DDEEF6", "following": false, "geo_enabled": false, "profile_use_background_image": true, "favourites_count": 1145840, "profile_image_url": "http://pbs.twimg.com/profile_images/710373355694592000/O9hNNqrn_normal.jpg", "is_translation_enabled": false, "follow_request_sent": false, "profile_link_color": "1DA1F2", "contributors_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/297856618/1406922190", "name": "\u307e\u3061\u3083\u304a\u3058", "profile_sidebar_border_color": "C0DEED", "time_zone": "Tokyo", "protected": false, "statuses_count": 854449, "profile_background_color": "C0DEED", "profile_image_url_https": "https://pbs.twimg.com/profile_images/710373355694592000/O9hNNqrn_normal.jpg", "verified": false, "listed_count": 84, "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "default_profile_image": false, "lang": "ja", "translator_type": "none", "has_extended_profile": false, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "screen_name": "macha_oji", 
"location": "\u6226\u4e89\u653e\u68c4\u3057\u305f\u5e73\u548c\u306e\u56fd\u306e\u9996\u90fd\u570f", "followers_count": 1913, "utc_offset": 32400, "created_at": "Fri May 13 07:19:59 +0000 2011" }, "place": null, "in_reply_to_screen_name": null, "coordinates": null, "contributors": null, "is_quote_status": false, "retweeted": false, "favorited": false, "in_reply_to_status_id": null, "lang": "ja", "in_reply_to_user_id": null, "in_reply_to_status_id_str": null, "retweet_count": 1, "created_at": "Sun Apr 22 11:47:37 +0000 2018", "in_reply_to_user_id_str": null }
We can convert the data into a dataframe to simplify its manipulation, as well as export it to CSV for further processing:
df = pd.DataFrame([ {'id': result.id, 'created_at': result.created_at, 'user': '@'+result.user.name, 'text': result.text } for result in search_results])[['id', 'created_at', 'user', 'text']]
display(df.head())
 | id | created_at | user | text
---|---|---|---|---
0 | 988021502216843264 | 2018-04-22 11:47:37 | @まちゃおじ | RT @ngcmk005: 【地震状況】 4月22日 気象庁発表の地震は、12時まで2回です... |
1 | 988021476304543745 | 2018-04-22 11:47:31 | @US Tribune News | Earthquake hits 41km NE of Visokoi Island, Sou... |
2 | 988021466317795328 | 2018-04-22 11:47:28 | @アン | RT @tenkijp_jishin: 22日20時36分頃、東京都で最大震度2を観測する地... |
3 | 988021449972629506 | 2018-04-22 11:47:24 | @声で本作りsmall world | RT @earthquake_jp: 【気象庁情報】22日 20時30分頃 新島・神津島近海... |
4 | 988021447359680514 | 2018-04-22 11:47:24 | @Noya Momose 百瀬直也 | 【地震情報】22日20:36 [ 最大震度 ] 震度 2 [ 震源地 ] 新島・神津島近海 ... |
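As mentioned above, exporting the dataframe to CSV only takes one line with pandas (the file name is just an example):
# Export the collected tweets to CSV for further processing:
df.to_csv('earthquake_tweets.csv', index=False, encoding='utf-8')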
Besides using keywords, we can also restrict the results to a particular location and a given language:
# What is the weather 500km around Lyon?:
keywords2 = ['weather', 'forecast', 'sun', 'rain', 'clouds', 'storm']
# Only in English, please!
lang = 'en'
# Get tweets around Lyon (latitude,longitude,radius):
geocode = '45.76,4.84,500km'
# Ask for 1500 tweets using the keywords (note that the standard search endpoint returns at most 100 per request):
search_results2 = api.search(q=' OR '.join(keywords2), geocode=geocode, lang=lang, count=1500)
# Convert to GeoPandas:
df2 = pd.DataFrame([ {'id': result.id, 'created_at': result.created_at, 'user': '@'+result.user.name, 'text': result.text, 'geometry': result.coordinates } for result in search_results2])[['id', 'created_at', 'user', 'text', 'geometry']]
df2['geometry'] = df2['geometry'].apply(lambda coords: np.nan if coords is None else Point(coords['coordinates']))
df2 = df2.dropna()  # Remove documents without a geometry point (the Twitter API may derive a location from user details rather than from the tweet itself).
df2 = GeoDataFrame(df2, crs={'init': 'epsg:4326'})  # Twitter coordinates are WGS84 longitude/latitude.
display(df2.head())
 | id | created_at | user | text | geometry
---|---|---|---|---|---
1 | 988021547678892032 | 2018-04-22 11:47:48 | @Montpellier Weather | current weather in Montpellier: clear sky, 22°... | POINT (3.86 43.61) |
4 | 988020934551289857 | 2018-04-22 11:45:21 | @Lyon Weather | current weather in Lyon: clear sky, 26°C\n31% ... | POINT (4.84 45.75) |
7 | 988020676010196994 | 2018-04-22 11:44:20 | @Marseille Weather | current weather in Marseille: clear sky, 24°C\... | POINT (5.41 43.31) |
14 | 988020239836241925 | 2018-04-22 11:42:36 | @Geneva Weather | current weather in Geneva: clear sky, 24°C\n44... | POINT (6.14 46.21) |
21 | 988019312999854080 | 2018-04-22 11:38:55 | @Monaco Weather | current weather in Monaco: clear sky, 20°C\n72... | POINT (7.42 43.74) |
We can now display the tweets on a map in order to see the weather around Lyon. We can also do some basic keyword matching to display icons representing the weather:
# Create a map:
results2_map = folium.Map([45.76, 4.84],
                          zoom_start=6,
                          tiles='Stamen Toner')
# Iterate over the documents and add them to the map:
for index, document in tqdm_notebook(df2.iterrows()):
    # Perform very simple word matching to pick an icon representing the weather:
    if 'sun' in document['text'].lower() or 'clear' in document['text'].lower():
        icon_img = 'sun-o'
        color = 'orange'
    elif 'rain' in document['text'].lower() or 'showers' in document['text'].lower():
        icon_img = 'umbrella'
        color = 'blue'
    elif 'storm' in document['text'].lower():
        icon_img = 'bolt'
        color = 'black'
    elif 'cloud' in document['text'].lower():
        icon_img = 'cloud'
        color = 'gray'
    else:
        icon_img = 'info-circle'
        color = 'green'
    icon = folium.Icon(color=color, icon_color='white', icon=icon_img, angle=0, prefix='fa')
    folium.Marker([document['geometry'].y, document['geometry'].x], popup=document['text'], icon=icon).add_to(results2_map)
display(results2_map)
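To share the map outside the notebook, folium can also write it to a standalone HTML file (the file name is just an example):
# Save the interactive map as a self-contained HTML page:
results2_map.save('weather_around_lyon.html')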
In order to obtain data in real time as it is produced, it is necessary to use the Twitter streaming API. In this section we show how to collect tweets using it.
The first step is to create a listener that will collect the tweets and store them. In a real pipeline, the tweets would more likely be passed on to another process as they are collected rather than stored.
# Create listener:
class TwitterStreamListener(tweepy.StreamListener):
    def __init__(self, api=None, max_count=1000):
        super(TwitterStreamListener, self).__init__(api)
        self.max_count = max_count
        self.count = 0
        self._data = []
        if self.max_count is not None:
            self._pbar = tqdm_notebook(total=self.max_count, desc='Collecting tweets.')
            self._pbar.clear()

    # Return the data as a dataframe:
    @property
    def data(self):
        results = self._data
        df = pd.DataFrame([ {'id': result.id, 'created_at': result.created_at, 'user': '@'+result.user.name, 'text': result.text, 'geometry': result.coordinates } for result in results])[['id', 'created_at', 'user', 'text', 'geometry']]
        df['geometry'] = df['geometry'].apply(lambda coords: np.nan if coords is None else Point(coords['coordinates']))
        df = GeoDataFrame(df, crs={'init': 'epsg:4326'})  # Twitter coordinates are WGS84 longitude/latitude.
        return df

    # Do something when a tweet is received:
    def on_status(self, status):
        self.count = self.count + 1
        #print(str(self.count) + ': ' + status.text)
        self._data.append(status)
        if self.max_count is not None:
            self._pbar.update()
        # Stop collecting when max_count is reached:
        if self.count == self.max_count:
            self._pbar.close()
            return False
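The streaming API may also send error codes, notably HTTP 420 when the client is being rate limited. As a sketch, the listener above could additionally override on_error to disconnect cleanly rather than letting Tweepy retry aggressively (this method belongs inside the class):
    # Sketch: disconnect when Twitter signals rate limiting (HTTP 420).
    # This handler would be added to TwitterStreamListener above.
    def on_error(self, status_code):
        if status_code == 420:
            return False  # returning False disconnects the stream
        return True       # any other error: let Tweepy reconnect with back-off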
We collect 10 tweets using the streaming API and convert them to a dataframe:
# Create a stream listener that collects 10 tweets:
stream_listener = TwitterStreamListener(max_count=10)
stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
# Get earthquake-related information:
keywords3 = ['earthquake', 'quake', 'magnitude', 'epicenter', 'aftershock']
# Go fetch:
stream.filter(track=keywords3)
display(stream_listener.data.head())
 | id | created_at | user | text | geometry
---|---|---|---|---|---
0 | 988022034620862464 | 2018-04-22 11:49:44 | @Chintamoni Mallick | RT @Kill_BillPandey: Aise logo ke liye sarkar ... | NaN |
1 | 988022066501730304 | 2018-04-22 11:49:51 | @しがない産婦人科医 | RT @tenkijp_jishin: 22日20時36分頃、東京都で最大震度2を観測する地... | NaN |
2 | 988022076962365440 | 2018-04-22 11:49:54 | @raa. | RT @justforwanna1: [HQ] 180421 Kang Daniel\nCr... | NaN |
3 | 988022088584908800 | 2018-04-22 11:49:57 | @My Earthquake Alerts | 1.86 earthquake occurred 1km NNE of Pahala, Ha... | NaN |
4 | 988022117588287488 | 2018-04-22 11:50:04 | @ツイトク.JP | RT @tenkijp_jishin: 22日20時36分頃、東京都で最大震度2を観測する地... | NaN |
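Like the search API, the streaming API can filter by location, this time with bounding boxes given as (south-west longitude, south-west latitude, north-east longitude, north-east latitude). A sketch reusing the listener above, with a rough, approximate bounding box around metropolitan France:
# Stream geotagged tweets from a rough bounding box around France
# (sw_lon, sw_lat, ne_lon, ne_lat -- approximate coordinates):
stream_listener2 = TwitterStreamListener(max_count=10)
stream2 = tweepy.Stream(auth=api.auth, listener=stream_listener2)
stream2.filter(locations=[-5.0, 42.0, 8.2, 51.1])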
In this tutorial we rely on the CrisisLexT26 corpus, a dataset of tweets collected during 26 large crisis events in 2012 and 2013, with about 1,000 tweets per crisis labeled for informativeness (i.e. "informative" or "not informative"), information type, and source.
In this section we concatenate all the flood events:
files = {
'Philippines floods': '../data/2-data-collection/2012_Philipinnes_floods-tweets_labeled.csv',
'Alberta floods': '../data/2-data-collection/2013_Alberta_floods-tweets_labeled.csv',
'Colorado floods': '../data/2-data-collection/2013_Colorado_floods-tweets_labeled.csv',
'Manila floods': '../data/2-data-collection/2013_Manila_floods-tweets_labeled.csv',
'Queensland floods': '../data/2-data-collection/2013_Queensland_floods-tweets_labeled.csv',
'Sardinia floods': '../data/2-data-collection/2013_Sardinia_floods-tweets_labeled.csv' }
frames = []
for event, f in tqdm_notebook(files.items(), desc='Fetch data'):
frames.append(pd.read_csv(f))
data = pd.concat(frames, keys=files.keys())
display(data.head())
 | | Tweet ID | Tweet Text | Information Source | Information Type | Informativeness
---|---|---|---|---|---|---
Colorado floods | 0 | 376843697943769088 | #Longmont #CO The Tiny Tim Center is now #hiri... | Not labeled | Not labeled | Not related
 | 1 | 378011169883037697 | RT @dlfluegge: Crazy Flooding in Boulder, Colo... | Media | Sympathy and support | Related - but not informative
 | 2 | 378020179214491649 | Here's the #boulderflood video that's circulat... | Outsiders | Other Useful Information | Related and informative
 | 3 | 378026101588496385 | RT @passantino: Video: Severe flooding hits ne... | Media | Other Useful Information | Related and informative
 | 4 | 378029784204206080 | Crazy Flooding in Boulder, Colorado http://t.c... | Media | Other Useful Information | Related and informative
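Before setting the labels aside (next paragraph), a quick look at their distribution is easy with value_counts; note that the CrisisLexT26 column names carry a leading space (as with ' Tweet Text' used below), which is assumed here:
# Inspect the informativeness label distribution
# (assumes the column is named ' Informativeness', with a leading space):
print(data[' Informativeness'].value_counts())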
Although the data has been manually annotated, let's ignore this information for the moment; we will use it later on when training text classifiers. Meanwhile, we can use off-the-shelf tools for clustering the data.
For example, we can use topic models for clustering the data into topics automatically:
# Load sklearn and LDA (Latent Dirichlet Allocation):
from sklearn.decomposition import LatentDirichletAllocation
# Load word tokenisers and vectorisers:
from sklearn.feature_extraction.text import CountVectorizer
Before fitting an LDA model, we need to tokenise the data so that each document is represented as a word-count vector (i.e., a vector indicating how many times each word of the vocabulary occurs in the document). Later on in this tutorial we will investigate other document representations using word embeddings.
# We split each document into words using the default Sklearn tokenizer:
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
vects = count_vectorizer.fit_transform(data[' Tweet Text'])
vocabulary = count_vectorizer.get_feature_names()
# How many tokens?:
print('Vocabulary size: '+str(len(vocabulary)))
Vocabulary size: 5793
# We want to extract 5 topics:
nb_topics = 5
lda = LatentDirichletAllocation(n_components=nb_topics, learning_method='batch', random_state=42)
# We fit the model to the textual data:
lda.fit(vects)
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method='batch', learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1, random_state=42, topic_word_prior=None, total_samples=1000000.0, verbose=0)
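As a quick sanity check of the fit, we can compute the model's perplexity on the training vectors (lower is better):
# Perplexity of the fitted LDA model on the training data:
print('Perplexity: ' + str(lda.perplexity(vects)))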
Let's have a look at the topics extracted by LDA by displaying the top words for each topic:
n_top_words = 30
topic_words = {}
# Get the top words for each topic:
for topic, comp in enumerate(lda.components_):
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words['Topic #'+str(topic)] = [vocabulary[i] for i in word_idx]
topic_words = pd.DataFrame.from_dict(topic_words)
display(topic_words)
 | Topic #0 | Topic #1 | Topic #2 | Topic #3 | Topic #4
---|---|---|---|---|---
0 | bigwet | rt | http | sardegna | http |
1 | rt | rescueph | floods | rt | rt |
2 | http | http | rt | http | flooding |
3 | qldfloods | reliefph | maringph | di | flood |
4 | na | help | colorado | la | floods |
5 | rescueph | need | manila | alluvione | colorado |
6 | qld | amp | flooding | il | yycflood |
7 | floodph | pls | sa | allertameteosar | coflood |
8 | maringph | flood | red | reliefph | abflood |
9 | flood | water | sardinia | le | calgary |
10 | brisbane | st | maring | non | boulderflood |
11 | sa | yycflood | heavy | olbia | australia |
12 | ng | relief | storm | che | alberta |
13 | water | hospital | rains | donations | amp |
14 | prayforthephilippines | rescue | metro | si | help |
15 | manila | food | dead | della | affected |
16 | just | needs | habagat | maringph | safe |
17 | ang | ust | ang | del | yyc |
18 | bundaberg | manila | na | forzasardegna | boulder |
19 | baha | new | cross | numeri | people |
20 | qpsmedia | use | rain | da | stay |
21 | st | hashtag | missing | 0789 | queensland |
22 | love | volunteers | people | emergenza | qld |
23 | alert | gmanews | tropical | al | news |
24 | queensland | send | state | ha | 000 |
25 | people | yyc | cyclone | philippines | victims |
26 | yycflood | mmda | mga | sono | river |
27 | river | marikina | ancalerts | prayforthephilippines | today |
28 | good | abflood | news | morti | evacuated |
29 | coast | city | pagasa | una | disaster |
Since the dataset is relatively small, the topics are not really well separated. However, we can see that the topics closely match the event instances, with some sub-topics as well. For example, Topic 1 seems to be about needs and volunteering, and Topic 2 about victims.
Using those keywords we can set the following names for each topic:
topics = ['thoughts', 'needs_and_volunteering', 'victims', 'donations', 'floods']
topic_words.columns = topics
display(topic_words.head())
 | thoughts | needs_and_volunteering | victims | donations | floods
---|---|---|---|---|---
0 | bigwet | rt | http | sardegna | http |
1 | rt | rescueph | floods | rt | rt |
2 | http | http | rt | http | flooding |
3 | qldfloods | reliefph | maringph | di | flood |
4 | na | help | colorado | la | floods |
Using the previous topics, we can now associate a label with each document and show the distribution of topics in a given dataset. For the Queensland floods, we have the following topic distribution:
queensland_data = pd.read_csv('../data/2-data-collection/2013_Queensland_floods-tweets_labeled.csv')
topic_preds = [topics[p] for p in np.argmax(lda.transform(count_vectorizer.transform(queensland_data[' Tweet Text'])), axis=1)]
We can also plot the distribution in order to better understand the situation:
pd.Series(topic_preds).value_counts(normalize=True).plot.barh()
plt.title('Topic distribution in Queensland floods', fontsize=20)
plt.xlabel('Proportion')
plt.ylabel('Topic')
plt.show()
Given the data distribution, we can now focus on a particular topic of interest. For example, we may want to see which documents mention victims:
classified_queensland_data = queensland_data.assign(topic=topic_preds)
classified_queensland_data.loc[classified_queensland_data['topic'] == 'victims']
 | Tweet ID | Tweet Text | Information Source | Information Type | Informativeness | topic
---|---|---|---|---|---|---
34 | 294609615847571456 | Queensland floods as cyclone sweeps past http:... | Outsiders | Other Useful Information | Related and informative | victims |
46 | 294720559403651072 | VIDEO: Tropical storm strikes Queensland: Trop... | Media | Other Useful Information | Related and informative | victims |
47 | 294720815243612160 | VIDEO: Tropical storm strikes Queensland: Trop... | Outsiders | Other Useful Information | Related and informative | victims |
97 | 295085275087003648 | Severe Weather Warning issued for #Scenicrim #... | Media | Caution and advice | Related and informative | victims |
207 | 295409335432007682 | Queensland braces for flooding: The Australian... | Media | Caution and advice | Related and informative | victims |
208 | 295409385772044288 | Queensland braces for flooding: The Australian... | Outsiders | Caution and advice | Related and informative | victims |
225 | 295419355632635904 | RT @AustralianNews: Queensland records first f... | Media | Affected individuals | Related and informative | victims |
348 | 295480454025465856 | @craiglowndes888 So you're missing out on the... | Outsiders | Other Useful Information | Related - but not informative | victims |
367 | 295490268705193985 | A person have to see this. Can't cease laughi... | Not labeled | Not labeled | Not related | victims |
368 | 295491501822197760 | http://t.co/uwJfTLgB Launch me personally from... | Not labeled | Not labeled | Not related | victims |
413 | 295525060477870081 | BBC News - Australian state of Queensland brac... | Media | Other Useful Information | Related and informative | victims |
430 | 295541346951892992 | Amazing -- thanks -- this kind of totally rock... | Not labeled | Not labeled | Not related | victims |
432 | 295544836604456960 | RT @TelegraphNews: Severe floods and tornadoes... | Media | Affected individuals | Related and informative | victims |
450 | 295574939128442880 | #bigwet http://t.co/hjNwGg5e huh?! he killed y... | Not labeled | Not labeled | Not related | victims |
540 | 295683596767883265 | Access to Stradbroke Island cut for second day... | Media | Infrastructure and utilities | Related and informative | victims |
550 | 295690588676816897 | Flooding hits eastern Australia: Hundreds of h... | Media | Affected individuals | Related and informative | victims |
551 | 295691150713569280 | BBC : Flooding hits eastern Australia: Hundred... | Media | Affected individuals | Related and informative | victims |
553 | 295691494629711872 | Flooding hits eastern Australia: Hundreds of h... | Media | Affected individuals | Related and informative | victims |
554 | 295691591094509569 | Flooding hits eastern Australia: Hundreds of h... | Media | Infrastructure and utilities | Related and informative | victims |
555 | 295691662431232000 | Flooding hits eastern Australia: Hundreds of h... | Media | Affected individuals | Related and informative | victims |
618 | 295738449888157696 | Flooding hits eastern Australia: Three people ... | Media | Affected individuals | Related and informative | victims |
653 | 295756598637367296 | Queensland's flood crisis deepens as death tol... | Media | Affected individuals | Related and informative | victims |
681 | 295769412244488192 | Feel for the people of QLD again hit by floods | Outsiders | Sympathy and support | Related - but not informative | victims |
704 | 295784322982637568 | http://t.co/5jVRt0y8: Australian state of Quee... | Media | Other Useful Information | Related and informative | victims |
708 | 295786793436069888 | Three killed in Queensland floods - Irish Time... | Media | Affected individuals | Related and informative | victims |
712 | 295792170525405184 | Three killed in Queensland floods - Irish Time... | Media | Affected individuals | Related and informative | victims |
719 | 295797644104695809 | Australian state of Queensland hit by deadly f... | Media | Affected individuals | Related and informative | victims |
720 | 295798008979804160 | Australian state of Queensland hit by deadly f... | Media | Other Useful Information | Related and informative | victims |
728 | 295805072191918080 | VIDEO: Floods cause chaos in Queensland: Three... | Outsiders | Affected individuals | Related and informative | victims |
729 | 295805172846821377 | VIDEO: Floods cause chaos in Queensland: Three... | Media | Affected individuals | Related and informative | victims |
731 | 295805432897880064 | BBC : VIDEO: Floods cause chaos in Queensland ... | Media | Other Useful Information | Related and informative | victims |
756 | 295822101061971968 | RT @AustralianNews: Queensland records first f... | Media | Affected individuals | Related and informative | victims |
770 | 295841675891339264 | Australia floods leave hundreds stranded in Bu... | Media | Affected individuals | Related and informative | victims |
779 | 295853344457641984 | RT @guardianworld: First fire, now floods: Aus... | Media | Other Useful Information | Related and informative | victims |
784 | 295861389124308993 | Poser RT @MissKayeSera: TWO sand bags, Tony. T... | Outsiders | Other Useful Information | Related and informative | victims |
851 | 296040611746824192 | Blue skies but still no power #bigwet #nopower | Eyewitness | Infrastructure and utilities | Related and informative | victims |
887 | 296088749761257472 | #QLDfloods #floods @ Holmview http://t.co/SOp... | Not labeled | Not labeled | Not applicable | victims |
945 | 296143556727435264 | Hospital patients flee rising floods: Heavy fl... | Outsiders | Affected individuals | Related and informative | victims |
946 | 296145469317451778 | Hospital patients flee rising floods: Heavy fl... | Media | Affected individuals | Related and informative | victims |
947 | 296145754542718976 | Hospital patients flee rising floods: Heavy fl... | Media | Affected individuals | Related and informative | victims |
983 | 296214356595703808 | Hospital patients flee Australia floods: Heavy... | Media | Affected individuals | Related and informative | victims |
1009 | 296269364854935553 | @iwasmadetolaugh but there's floods in queensl... | Outsiders | Not applicable | Related - but not informative | victims |
1086 | 296553944191209472 | Queensland flood toll rises to six: Queensland... | Media | Affected individuals | Related and informative | victims |
1094 | 296644994155040768 | Australia floods recede as death toll rises: J... | Media | Affected individuals | Related and informative | victims |
1151 | 297254644625838080 | Wisma Putra Malaysian victim of Queensland flo... | Media | Affected individuals | Related and informative | victims |