Import Libraries

In [1]:
import tweepy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image  # replaces scipy.misc.imread, which was removed in SciPy 1.2
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob

Authentication

In [2]:
consumer_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_token = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_token_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)  # OAuth 1a handshake with Twitter's API
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)  # create the API object
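Hardcoding credentials in a notebook makes them easy to leak when sharing. A minimal sketch that reads the same four values from environment variables instead (the variable names here are an assumption, not a Twitter convention):

import os

# Assumed names; export these in the shell before launching the notebook
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']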

Extracting Tweets

In [3]:
results = []
for tweet in tweepy.Cursor(api.search, q='millennials', lang='en').items(2000):  # api.search_tweets on Tweepy >= 4
    results.append(tweet)

print(type(results))
print(len(results))
#print(results[1999].text)  # peek at the last of the 2000 tweets
<class 'list'>
2000
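The standard search endpoint is rate limited, so a 2,000-tweet pull can stall partway through the cursor. A sketch that lets Tweepy sleep through the limit window instead of raising (the wait_on_rate_limit flag is part of Tweepy's API constructor):

api = tweepy.API(auth, wait_on_rate_limit=True)  # block until the rate-limit window resets

results = []
for tweet in tweepy.Cursor(api.search, q='millennials', lang='en').items(2000):
    results.append(tweet)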

Store Data in a DataFrame

In [4]:
def tweets_df(results):
    """Flatten a list of tweepy Status objects into a DataFrame, one row per tweet."""
    id_list = [tweet.id for tweet in results]
    data_set = pd.DataFrame(id_list, columns=["id"])

    data_set["text"] = [tweet.text for tweet in results]
    data_set["created_at"] = [tweet.created_at for tweet in results]
    data_set["retweet_count"] = [tweet.retweet_count for tweet in results]
    data_set["user_screen_name"] = [tweet.author.screen_name for tweet in results]
    data_set["user_followers_count"] = [tweet.author.followers_count for tweet in results]
    data_set["user_location"] = [tweet.author.location for tweet in results]
    data_set["Hashtags"] = [tweet.entities.get('hashtags') for tweet in results]

    return data_set
data_set = tweets_df(results)
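Each column comprehension above re-walks the full result list. An equivalent one-pass sketch that builds a dict per tweet and hands the rows to the DataFrame constructor:

def tweets_df(results):
    rows = [{"id": t.id,
             "text": t.text,
             "created_at": t.created_at,
             "retweet_count": t.retweet_count,
             "user_screen_name": t.author.screen_name,
             "user_followers_count": t.author.followers_count,
             "user_location": t.author.location,
             "Hashtags": t.entities.get('hashtags')} for t in results]
    return pd.DataFrame(rows)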
    

Remove Duplicate Tweets

In [5]:
text = data_set["text"]

for i in range(len(text)):
    # Strip URLs first: shortened t.co links are unique per retweet and would defeat drop_duplicates
    txt = ' '.join(word for word in text[i].split() if not word.startswith('https:'))
    data_set.at[i, 'text2'] = txt  # DataFrame.set_value was removed in pandas 1.0

data_set.drop_duplicates('text2', inplace=True)
data_set.reset_index(drop=True, inplace=True)
data_set.drop('text', axis=1, inplace=True)
data_set.rename(columns={'text2': 'text'}, inplace=True)
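The same cleanup works without a Python-level loop via pandas' vectorized string methods (a sketch; the regex mirrors the loop's assumption that links always start with https:):

data_set['text'] = (data_set['text']
                    .str.replace(r'https:\S+', '', regex=True)
                    .str.split().str.join(' '))  # also collapses leftover whitespace
data_set = data_set.drop_duplicates('text').reset_index(drop=True)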

Sentiment Analysis of Tweets

In [6]:
text = data_set["text"]

for i in range(len(text)):
    # TextBlob's default pattern analyzer returns a polarity score in [-1.0, 1.0]
    polarity = TextBlob(text[i]).sentiment.polarity
    data_set.at[i, 'Sentiment'] = polarity
    if polarity < 0.00:
        data_set.at[i, 'SentimentClass'] = 'Negative'
    elif polarity > 0.00:
        data_set.at[i, 'SentimentClass'] = 'Positive'
    else:
        data_set.at[i, 'SentimentClass'] = 'Neutral'
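The loop can also be collapsed into a vectorized sketch with Series.apply and numpy.select, using the same thresholds:

data_set['Sentiment'] = data_set['text'].apply(lambda t: TextBlob(t).sentiment.polarity)
data_set['SentimentClass'] = np.select(
    [data_set['Sentiment'] < 0, data_set['Sentiment'] > 0],
    ['Negative', 'Positive'], default='Neutral')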
In [7]:
data_set.to_csv("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\Millennials.csv")

Extract Hashtags from All Tweets

In [8]:
# Collect every hashtag across all tweets into a single-column DataFrame
# (row-by-row DataFrame.set_value was removed in pandas 1.0)
hashtags = []
for tweet in results:
    for h in tweet.entities.get('hashtags', []):
        hashtags.append(h['text'])

Htag_df = pd.DataFrame({'Hashtag': hashtags})
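To eyeball the most frequent tags before plotting anything, a quick sketch with collections.Counter:

from collections import Counter

tag_counts = Counter(Htag_df['Hashtag'])
print(tag_counts.most_common(10))  # top 10 hashtags with their counts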
In [9]:
Htag_df
Out[9]:
Hashtag
0 vc
1 venture
2 talent
3 manufacturers
4 mobility
5 fintech
6 finserv
7 banking
8 in
9 College
10 Millennials
11 Millennials
12 babyboomers
13 Millennials
14 babyboomers
15 Millennials
16 GenZ
17 SupplyChain
18 AR
19 VR
20 IoT
21 Wearables
22 Millennials
23 AI
24 2A
25 marketing
26 marketing
27 fox
28 OutNumbered
29 ConceptCar
... ...
1528 LaborDay
1529 food
1530 fastfood
1531 Millennials
1532 Podcast
1533 GG
1534 Boomers
1535 GenX
1536 Millennials
1537 Teen
1538 College
1539 nextgen
1540 millennials
1541 entrepreneur
1542 nextgen
1543 millennials
1544 kbye
1545 digitalPR
1546 Polls
1547 Millennials
1548 meat
1549 millennials
1550 Levo
1551 Millennials
1552 meat
1553 millennials
1554 food
1555 housing
1556 GenZ
1557 Millennials

1558 rows × 1 columns

In [16]:
Millennials_Htag_wordcloud = Htag_df.groupby('Hashtag').size()
Millennials_Htag_wordcloud.to_csv("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\Millennials_Htag_wordcloud.csv")
In [10]:
# Join all extracted hashtags into one string for the word cloud
Hashtag_Combined = " ".join(Htag_df['Hashtag'].values.astype(str))

# Drop the search term itself (and other generation tags) so it doesn't dominate the cloud
exclude = {'millennials', 'Millennials', 'Boomers', 'GenX'}
no_millennials = " ".join(word for word in Hashtag_Combined.split() if word not in exclude)

# Load the mask as an 8-bit grayscale array; PIL replaces scipy.misc.imread(..., flatten=True)
# and avoids wordcloud's "mask image should be unsigned byte" warning about float arrays
Tweet_mask = np.array(Image.open("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\twitter_mask.png").convert('L'))

# Create a Word Cloud shaped by the mask image
wc = WordCloud(background_color="white", stopwords=STOPWORDS, mask=Tweet_mask)
wc.generate(no_millennials)
plt.imshow(wc)
plt.axis("off")
plt.savefig('C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\millennials_Hashtag.png', dpi=300)
plt.show()
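generate() re-tokenizes the joined string; since per-tag counts are already in hand, an alternative sketch feeds frequencies to the cloud directly via WordCloud.generate_from_frequencies:

counts = Htag_df['Hashtag'].value_counts().drop(
    ['millennials', 'Millennials', 'Boomers', 'GenX'], errors='ignore')
wc = WordCloud(background_color="white", mask=Tweet_mask)
wc.generate_from_frequencies(counts.to_dict())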