import csv
import json
import operator

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy
from scipy.misc import imread  # NOTE(review): removed in scipy >= 1.2 -- pin scipy or migrate to imageio.imread
from textblob import TextBlob
from textblob import Word
from textblob.sentiments import NaiveBayesAnalyzer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# --- Twitter API authentication --------------------------------------------
# NOTE(review): credentials are hard-coded placeholders; load the real keys
# from environment variables or a config file instead of committing them.
consumer_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_token = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_token_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)  # interacting with Twitter's API
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)  # creating the API object

# Collect up to 2000 English-language tweets matching 'millennials'.
# NOTE(review): in tweepy >= 4 this endpoint was renamed api.search_tweets --
# confirm the installed tweepy version.
results = []
for tweet in tweepy.Cursor(api.search, q='millennials', lang="en").items(2000):
    results.append(tweet)
print(type(results))
print(len(results))
#print (results[4000].text)
# Sample notebook output (commented out so the file parses as Python):
# <class 'list'> 2000
def tweets_df(results):
    """Flatten a list of tweet objects into a pandas DataFrame.

    Parameters
    ----------
    results : list
        Tweet objects exposing ``id``, ``text``, ``created_at``,
        ``retweet_count``, ``author.screen_name``, ``author.followers_count``,
        ``author.location`` and an ``entities`` dict with a 'hashtags' key.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns listed below, in order.
    """
    columns = ["id", "text", "created_at", "retweet_count",
               "user_screen_name", "user_followers_count",
               "user_location", "Hashtags"]
    # Build all columns in a single pass over `results` instead of one
    # list comprehension (i.e. one full pass) per column as before.
    rows = {name: [] for name in columns}
    for tweet in results:
        rows["id"].append(tweet.id)
        rows["text"].append(tweet.text)
        rows["created_at"].append(tweet.created_at)
        rows["retweet_count"].append(tweet.retweet_count)
        rows["user_screen_name"].append(tweet.author.screen_name)
        rows["user_followers_count"].append(tweet.author.followers_count)
        rows["user_location"].append(tweet.author.location)
        rows["Hashtags"].append(tweet.entities.get('hashtags'))
    return pd.DataFrame(rows, columns=columns)
data_set = tweets_df(results)

# Strip URLs from every tweet, then drop exact-duplicate texts (e.g. retweets
# of the same tweet collapse into one row once the https links are removed).
text = data_set["text"]
for i in range(0, len(text)):
    # Keep every whitespace-separated token that is not an https link.
    txt = ' '.join(word for word in text[i].split() if not word.startswith('https:'))
    # DataFrame.set_value was removed in pandas 1.0 -- use .at instead.
    data_set.at[i, 'text2'] = txt
data_set.drop_duplicates('text2', inplace=True)
data_set.reset_index(drop=True, inplace=True)
data_set.drop('text', axis=1, inplace=True)
data_set.rename(columns={'text2': 'text'}, inplace=True)
# Score each cleaned tweet with TextBlob's polarity (-1.0 .. 1.0) and bucket
# it into a three-way categorical label.
text = data_set["text"]
for i in range(0, len(text)):
    textB = TextBlob(text[i])
    sentiment = textB.sentiment.polarity
    # DataFrame.set_value was removed in pandas 1.0 -- use .at instead.
    data_set.at[i, 'Sentiment'] = sentiment
    if sentiment < 0.00:
        data_set.at[i, 'SentimentClass'] = 'Negative'
    elif sentiment > 0.00:
        data_set.at[i, 'SentimentClass'] = 'Positive'
    else:
        data_set.at[i, 'SentimentClass'] = 'Neutral'
data_set.to_csv("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\Millennials.csv")
# Collect every hashtag from every collected tweet into a one-column
# DataFrame (one row per hashtag occurrence, duplicates kept on purpose so
# frequencies can be computed later).
Htag_df = pd.DataFrame()
j = 0
for tweet in range(0, len(results)):
    hashtag = results[tweet].entities.get('hashtags')
    for i in range(0, len(hashtag)):
        Htag = hashtag[i]['text']
        # DataFrame.set_value was removed in pandas 1.0 -- use .at instead.
        Htag_df.at[j, 'Hashtag'] = Htag
        j = j + 1
Htag_df
# Sample notebook output (commented out so the file parses as Python):
# Hashtag | |
# ---|---|
# 0 | vc |
# 1 | venture |
# 2 | talent |
# 3 | manufacturers |
# 4 | mobility |
# 5 | fintech |
# 6 | finserv |
# 7 | banking |
# 8 | in |
# 9 | College |
# 10 | Millennials |
# 11 | Millennials |
# 12 | babyboomers |
# 13 | Millennials |
# 14 | babyboomers |
# 15 | Millennials |
# 16 | GenZ |
# 17 | SupplyChain |
# 18 | AR |
# 19 | VR |
# 20 | IoT |
# 21 | Wearables |
# 22 | Millennials |
# 23 | AI |
# 24 | 2A |
# 25 | marketing |
# 26 | marketing |
# 27 | fox |
# 28 | OutNumbered |
# 29 | ConceptCar |
# ... | ... |
# 1528 | LaborDay |
# 1529 | food |
# 1530 | fastfood |
# 1531 | Millennials |
# 1532 | Podcast |
# 1533 | GG |
# 1534 | Boomers |
# 1535 | GenX |
# 1536 | Millennials |
# 1537 | Teen |
# 1538 | College |
# 1539 | nextgen |
# 1540 | millennials |
# 1541 | entrepreneur |
# 1542 | nextgen |
# 1543 | millennials |
# 1544 | kbye |
# 1545 | digitalPR |
# 1546 | Polls |
# 1547 | Millennials |
# 1548 | meat |
# 1549 | millennials |
# 1550 | Levo |
# 1551 | Millennials |
# 1552 | meat |
# 1553 | millennials |
# 1554 | food |
# 1555 | housing |
# 1556 | GenZ |
# 1557 | Millennials |
# 1558 rows × 1 columns
# Frequency of each distinct hashtag, exported for later analysis.
Millennials_Htag_wordcloud = Htag_df.groupby('Hashtag').size()
Millennials_Htag_wordcloud.to_csv("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\Millennials_Htag_wordcloud.csv")

# Join all the hashtags into one space-separated string for WordCloud.
Hashtag_Combined = " ".join(Htag_df['Hashtag'].values.astype(str))

# Drop the search terms themselves so they do not dominate the cloud.
no_millennials = " ".join([word for word in Hashtag_Combined.split()
                           if word != 'millennials'
                           and word != 'Millennials'
                           and word != 'Boomers'
                           and word != 'GenX'
                           ])

# imread(..., flatten=True) yields a float array, which makes WordCloud emit
# "mask image should be unsigned byte between 0 and 255" -- cast to uint8 so
# the mask has the dtype WordCloud expects.
Tweet_mask = imread("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\twitter_mask.png", flatten=True).astype(np.uint8)

# Create a word cloud shaped by the Twitter-bird mask and save it.
wc = WordCloud(background_color="white", stopwords=STOPWORDS, mask=Tweet_mask)
wc.generate(no_millennials)
plt.imshow(wc)
plt.axis("off")
plt.savefig('C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\millennials_Hashtag.png', dpi=300)
plt.show()
# Warning previously seen at runtime (fixed by the uint8 mask cast above):
# C:\Continuum\Anaconda3\lib\site-packages\wordcloud\wordcloud.py:372: UserWarning: mask image should be unsigned byte between 0 and 255. Got a float array warnings.warn("mask image should be unsigned byte between 0"