This notebook shows how to analyze a collection of passages like Tweets for sentiment.
This is based on Neal Caren's "An introduction to text analysis with Python, Part 3".
Here we will get the data to test and our positive and negative dictionaries.
** Getting by URL **
Here you can see how to get the data by URL. This shows just how to get the negative words. The others are positive.txt and obama_tweets.txt .
import urllib.request
# Download the negative-word list and preview the first 100 bytes of it.
path = 'http://www.unc.edu/~ncaren/haphazard/negative.txt'
with urllib.request.urlopen(path) as resp:
    negData = resp.read()  # raw bytes; not decoded to str here
print(negData[:100])
b'abandoned\nabandonment\naberration\naberration\nabhorred\nabhorrence\nabhorrent\nabhorrently\nabhors\nabhors\n'
** Opening data files **
Instead of getting the data from the web in the script, I suggest you just download it and save it in the folder your script is in. Here are the links
Assuming you have saved the files in the folder with your script, here is how to see what is in the folder.
%ls
Collocates-Copy1.ipynb SimpleSentimentAnalysis.ipynb Collocates.ipynb Untitled.ipynb ComplexSentimentAnalysis.ipynb negative.txt Handling Texts.ipynb obama_tweets.txt Hume Enquiry.txt positive.txt Hume Treatise.txt
Now we load the negative words.
# Read the negative-word list; the file holds one word per line.
with open("negative.txt", "r") as f:
    negText = f.read()
negTokens = negText.split("\n")  # This splits the text file into tokens on the new line character
# A trailing newline leaves an empty final item. Drop it only when it is
# actually empty, so a file without a trailing newline keeps its last word.
if negTokens[-1] == "":
    negTokens.pop()
print(negTokens[-10:])
['wretchedly', 'wretchedness', 'wrong', 'wrongful', 'wrought', 'wrought', 'yawn', 'zealot', 'zealous', 'zealously']
Here we load the positive words.
# Read the positive-word list; the file holds one word per line.
with open("positive.txt", "r") as f:
    posText = f.read()
posTokens = posText.split("\n")  # This splits the text file into tokens on the new line character
# A trailing newline leaves an empty final item. Drop it only when it is
# actually empty, so a file without a trailing newline keeps its last word.
if posTokens[-1] == "":
    posTokens.pop()
print(posTokens[-10:])
['worthwhile', 'worthy', 'wow', 'wry', 'yearning', 'yearningly', 'youthful', 'zeal', 'zenith', 'zest']
Here we get the tweets.
# Read the tweets; the file holds one tweet per line.
with open("obama_tweets.txt", "r") as f:
    tweetsText = f.read()
tweetsTokens = tweetsText.split("\n")  # This splits the text file into one item per line
# A trailing newline leaves an empty final item. Drop it only when it is
# actually empty, so a file without a trailing newline keeps its last tweet.
if tweetsTokens[-1] == "":
    tweetsTokens.pop()
print(tweetsTokens[:2])
['Obama has called the GOP budget social Darwinism. Nice try, but they believe in social creationism.', 'In his teen years, Obama has been known to use marijuana and cocaine.']
Now we will create two functions. The first for tokenizing a tweet, the second for calculating positive/negative words.
import re
def tokenizer(theText):
    """Return the lowercased word tokens found in ``theText``.

    A token starts with a word character and may continue with word
    characters or hyphens, so "well-known" stays a single token.
    """
    lowered = theText.lower()
    return [match.group(0) for match in re.finditer(r'\b\w[\w-]*\b', lowered)]
def calculator(theTweet):
    """Return the net sentiment score of ``theTweet``.

    The score is the number of tokens found in the module-level
    ``posTokens`` list minus the number found in ``negTokens``. Repeated
    words count every time they occur.
    """
    words = tokenizer(theTweet)
    positives = sum(1 for word in words if word in posTokens)
    negatives = sum(1 for word in words if word in negTokens)
    return positives - negatives
# Here is a line for testing this
# print(calculator('Obama has called wrong wrong the GOP budget social Darwinism. Nice try, but they believe in social creationism.'))
Now we will use the calculator to calculate how many positive and negative tweets.
Note that you can set a threshold on the score (positive words minus negative words) for a Tweet to be considered positive or negative.
# Thresholds applied to the net score (positive words minus negative words)
posi = 1  # A tweet is positive when its net score is greater than 1
nega = 0  # A tweet is negative when its net score is less than 0
# Reset the tallies
numTweets = numPosTweets = numNegTweets = numNeutTweets = 0
# Score every tweet and tally it as positive, negative or neutral.
for tweet in tweetsTokens:
    calc = calculator(tweet)
    numTweets += 1  # every tweet is counted exactly once, whatever its class
    if calc > posi:
        numPosTweets += 1
    elif calc < nega:
        numNegTweets += 1
    else:
        numNeutTweets += 1
# Report the tallies, one per line
print(
    "Total: " + str(numTweets),
    "Positive: " + str(numPosTweets),
    "Neutral: " + str(numNeutTweets),
    "Negative: " + str(numNegTweets),
    sep="\n",
)
Total: 1380 Positive: 81 Neutral: 1047 Negative: 252
This will gather a chosen number of examples of positive tweets.
# Threshold applied to the net score (positive words minus negative words)
posi = 1  # A tweet qualifies when its net score is greater than 1
numberWanted = 4  # Here you decide how many tweets you want
# Here we prime our variables
numTweets = 0
numPosTweets = 0
posiTweetList = []
# Collect the first `numberWanted` positive tweets, in file order.
for tweet in tweetsTokens:
    # Stop as soon as the quota is met so the remaining tweets are not scored.
    if numPosTweets >= numberWanted:
        break
    calc = calculator(tweet)
    if calc > posi:
        numPosTweets += 1
        posiTweetList.append(tweet)
print(posiTweetList)
["#WhatsRomneyHiding? Obama's dignity and sense of humor? #p2 #tcot", "RealClearPolitics - Obama's Organizational Advantage on Full ...: As a small but electorally significant state t... http://t.co/3Ax22aBB", "RT @wilycyotee Pres. Obama's ongoing support of women is another reason I am so proud he is my President! @edshow #Obama2012", 'If Obama win 2012 Election wait til 2016 he will have full white hair! just like Bill clinton!']
This will gather examples of negative tweets.
# Threshold applied to the net score (positive words minus negative words)
nega = -1  # A tweet qualifies when its net score is less than -1
numberWanted = 4  # Here you decide how many tweets you want
# Here we prime our variables
numTweets = 0
numNegTweets = 0
negaTweetList = []
# Collect the first `numberWanted` negative tweets, in file order.
for tweet in tweetsTokens:
    # Stop as soon as the quota is met so the remaining tweets are not scored.
    if numNegTweets >= numberWanted:
        break
    calc = calculator(tweet)
    if calc < nega:
        numNegTweets += 1
        negaTweetList.append(tweet)
print(negaTweetList)
['President Obama < Lindsay Lohan RUMORS beginning cross shape lights on ST < 1987 Analyst64 DC bicycle courier < Video changes to scramble.', '@edshow the direspect of President #Obama is based on racism. They do not want a Black PRESIDENT. #edshow', '@JoeSixpackSays Our Troops NEED TO COME HOME !!!! BRING OUR TROOPS HOME NOW OBAMA YOU BASTARD OOOORAH SEMPER FI', 'Attorney Mario Apuzzo Files Ballot Access Challenge Against Obama Today in New Jersey http://t.co/06rD6lCL']
Here you can take a tweet and test it to see how many positive or negative words it has.
# Ask for a tweet and print its net score (positive words minus negative words).
tweetToCalc = input("What is the tweet to calculate? ")
print(calculator(tweetToCalc))
What is the tweet to calculate? the rest is ok bad bad bad -3
This will gather the words that are positive in the tweets and tabulate them.
import re
# Accumulators that wordsCalculator fills with every matched word
posWordsList, negWordsList = [], []
def tokenizer(theText):
    """Split ``theText`` into lowercased word tokens.

    Tokens begin with a word character and may contain further word
    characters or hyphens; all other punctuation separates tokens.
    """
    pattern = re.compile(r'\b\w[\w-]*\b')
    return pattern.findall(theText.lower())
def wordsCalculator(theTweet):
    """Record the sentiment words found in ``theTweet`` and return its net score.

    Every token found in the module-level ``posTokens`` is appended to
    ``posWordsList`` and every token found in ``negTokens`` is appended to
    ``negWordsList``, in the order the tokens appear in the tweet.

    Returns:
        The number of positive words minus the number of negative words,
        the same score that ``calculator`` reports. Callers that only
        want the side effect on the two lists can ignore it.
    """
    theTweetTokens = tokenizer(theTweet)
    posFound = [word for word in theTweetTokens if word in posTokens]
    negFound = [word for word in theTweetTokens if word in negTokens]
    posWordsList.extend(posFound)
    negWordsList.extend(negFound)
    return len(posFound) - len(negFound)
# Ask for a tweet and collect its sentiment words into the two lists.
tweet2Process = input("What tweet do you want to process? ")
wordsCalculator(tweet2Process)
# Show at most the first 10 words gathered in each list.
print("Positive words: " + str(posWordsList[:10]))
print("Negative words: " + str(negWordsList[:10]))
What tweet do you want to process? Positive words: [] Negative words: []
import re
# Thresholds applied to the net score (positive words minus negative words):
# a tweet is positive when its score is greater than posi and negative when
# its score is less than nega.
posi, nega = 1, 0
# Fresh accumulators for the matched words
posWordsList, negWordsList = [], []
# Reset the tweet tallies
numTweets = numPosTweets = numNegTweets = numNeutTweets = 0
def wordsGathering(theTweet):
    """Collect the sentiment words of ``theTweet`` and return its net score.

    Tokens found in ``posTokens`` are appended to the module-level
    ``posWordsList`` and tokens found in ``negTokens`` to ``negWordsList``.
    The return value is the positive count minus the negative count.
    """
    positives = 0
    negatives = 0
    for token in tokenizer(theTweet):
        # The two checks are independent: a token present in both word
        # lists is recorded in both result lists.
        if token in posTokens:
            positives += 1
            posWordsList.append(token)
        if token in negTokens:
            negatives += 1
            negWordsList.append(token)
    return positives - negatives
# Score every tweet (gathering its sentiment words as a side effect) and
# tally it as positive, negative or neutral.
for tweet in tweetsTokens:
    calc = wordsGathering(tweet)
    numTweets += 1  # every tweet is counted exactly once, whatever its class
    if calc > posi:
        numPosTweets += 1
    elif calc < nega:
        numNegTweets += 1
    else:
        numNeutTweets += 1
# Report how many sentiment words were gathered across all tweets
posTotal = len(posWordsList)
negTotal = len(negWordsList)
print("Positive words: " + str(posTotal))
print("Negative words: " + str(negTotal))
Positive words: 756 Negative words: 552
import nltk, matplotlib
# Build a frequency distribution over every positive word gathered from the tweets
posDist = nltk.FreqDist(posWordsList)
# Print a table of the 10 most frequent positive words with their counts
posDist.tabulate(10)
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Plot the 25 most frequent positive words
posDist.plot(25, title="Top Positive Words")
will just supreme white important good right nice interest support 66 44 32 31 26 18 17 17 16 16
import nltk, matplotlib
# Build a frequency distribution over every negative word gathered from the tweets
negDist = nltk.FreqDist(negWordsList)
# Print a table of the 10 most frequent negative words with their counts
negDist.tabulate(10)
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Plot the 25 most frequent negative words
negDist.plot(25, title="Top Negative Words")
need bully against trying war dumb down rumors bad less 26 19 14 13 11 11 11 10 9 8