Author: Matthew Huh
The objective of this program is to retrieve tweets about each airline from Twitter. Since the Twitter API limits access to tweets from the past week, the data is time-sensitive and non-repeatable. The output of this program is a CSV file that will be used as the testing set for the main project. To see how this dataset is utilized, please view the project below.
Sentiment Analysis using Airline Tweets https://github.com/mhuh22/Thinkful/blob/master/Bootcamp/Unit%207/Sentiment%20Analysis%20using%20Airline%20Tweets.ipynb
import tweepy
import twitter_credentials
from textblob import TextBlob
import pandas as pd
import time
import datetime
import os
# Authenticate with the Twitter API via Tweepy, using keys kept out of
# version control in a local twitter_credentials.py module.
auth = tweepy.OAuthHandler(
    twitter_credentials.CONSUMER_KEY,
    twitter_credentials.CONSUMER_SECRET,
)
auth.set_access_token(
    twitter_credentials.ACCESS_TOKEN,
    twitter_credentials.ACCESS_TOKEN_SECRET,
)
api = tweepy.API(auth)

# Inspect the current search rate-limit window before collecting anything
data = api.rate_limit_status()
print(data['resources']['search']['/search/tweets'])
Example output: {'limit': 180, 'remaining': 180, 'reset': 1543528259}
class listener(tweepy.StreamListener):
    """Stream listener that appends each raw tweet payload to twitDB.csv."""

    def on_data(self, data, time_limit=1):
        """Handle one incoming tweet: echo it and append it to the CSV file.

        Returns True on success so the stream stays open; on failure, logs
        the error and sleeps briefly before the stream continues.
        """
        # NOTE(review): start_time/limit are recorded but never checked in
        # this class — presumably intended for a duration cutoff; confirm.
        self.start_time = time.time()
        self.limit = time_limit
        try:
            print(data)
            # Context manager guarantees the handle is closed even on error
            # (the original open/write/close leaked on a write failure).
            with open('twitDB.csv', 'a') as save_file:
                save_file.write(data)
                save_file.write('\n')
            return True
        except BaseException as e:  # bug fix: 'e' was previously unbound here
            print('failed ondata', str(e))
            time.sleep(5)

    def on_error(self, status):
        # Report the HTTP error status delivered by the stream.
        print(status)
# Dataframe that will accumulate one row per collected tweet
tweet_df = pd.DataFrame(columns=['airline', 'text'])

# Official Twitter handles: carriers present in the training data,
# plus additional carriers reserved for the test set.
trained_airlines = ['@AmericanAir', '@Delta', '@SouthwestAir', '@united']
test_airlines = ['@AlaskaAir', '@Allegiant', '@FlyFrontier', '@HawaiianAir', '@JetBlue', '@SpiritAirlines']

# Merge both groups into a single alphabetized list and display it
airlines = sorted(trained_airlines + test_airlines)
print(airlines)
Example output: ['@AlaskaAir', '@Allegiant', '@AmericanAir', '@Delta', '@FlyFrontier', '@HawaiianAir', '@JetBlue', '@SouthwestAir', '@SpiritAirlines', '@united']
# Access data for each airline, and append to the dataframe.
# Query limit is 180 per 15 minutes; 18 passes x 10 airlines = 180 queries,
# so this is the most we can do in a single window.
rows = []
for _ in range(18):
    # Query each airline and retrieve up to 100 tweets per request
    for airline in airlines:
        airline_tweets = api.search(airline, count=100)
        # Collect plain dicts instead of calling DataFrame.append per tweet:
        # append is deprecated (removed in pandas 2.0) and copies the whole
        # frame on every call, making the loop quadratic.
        for tweet in airline_tweets:
            rows.append({'airline': airline, 'text': tweet.text})
# Build the dataframe once at the end, keeping the pre-declared columns.
tweet_df = pd.concat([tweet_df, pd.DataFrame(rows)], ignore_index=True)

# View search rate limit after the run (expect 'remaining' near 0)
data = api.rate_limit_status()
print(data['resources']['search']['/search/tweets'])
Example output: {'limit': 180, 'remaining': 0, 'reset': 1543528259}
# View the shape of the dataframe (180 queries x up to 100 tweets,
# e.g. (18000, 2)). Wrapped in print(): a bare expression only displays
# in a notebook and is a no-op when run as a script.
print(tweet_df.shape)

# Preview the first rows to verify the collection worked
print(tweet_df.head())
| # | airline | text |
|---|---------|------|
| 0 | @AlaskaAir | RT @AlaskaAir: Happy birthday to our Chief Foo... |
| 1 | @AlaskaAir | @AlaskaAir My monthly+ flughts in and out of M... |
| 2 | @AlaskaAir | RT @ChrisEgan5: UW #Husky fans fired up for @p... |
| 3 | @AlaskaAir | @ChrisEgan5 @mcclainfan59 @pac12 @AlaskaAir @U... |
| 4 | @AlaskaAir | @AlaskaAir Stupid, stupid decision!! |
# Save extracted data to a local directory.
# Ensure the target directory exists first: to_csv raises OSError if it
# doesn't. The previous exists/remove dance was redundant — to_csv opens
# the file in write mode and overwrites any existing file anyway.
os.makedirs('airline_tweets', exist_ok=True)
tweet_df.to_csv('airline_tweets/test_set.csv', encoding='utf-8')
This file requires a separate file called twitter_credentials.py with the following format:

    CONSUMER_KEY = '...'
    CONSUMER_SECRET = '...'
    ACCESS_TOKEN = '...'
    ACCESS_TOKEN_SECRET = '...'

Find credentials here after logging in: https://apps.twitter.com/app/15976800/keys