In [1]:
'''

This script reads the following inputs.
- Our processed tweets, i.e. the output of 02_Append_All_Tweets_together.ipynb
- A readily available humor corpus (positive as well as negative class) - Oneliners, Proverbs, Wikipedia sentences, Reuters news headlines. Found at https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection

It creates a final dataframe with (sentence,class) columns that we will use later for modelling.
'''

#Import required libraries
import pandas as pd
import random
In [2]:
#Load our combined tweets and keep only the text column.
tweets = pd.read_csv('datasets/Tweets_combined_20190714.csv')['tweet_text']

#Load the additional humor corpus from
#https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection
#NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
oneliners, proverbs, wiki, reuters = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'datasets/humorous_oneliners.pickle',
        'datasets/proverbs.pickle',
        'datasets/wiki_sentences.pickle',
        'datasets/reuters_headlines.pickle',
    )
)
In [3]:
#Class labels: 1 = positive (our tweets and the oneliners from the humor corpus),
#0 = negative (proverbs, Wikipedia sentences and Reuters headlines).
def _with_label(sentences, label):
    """Return a list of (item, label) tuples, one per item yielded by iterating `sentences`."""
    return [(sentence, label) for sentence in sentences]


#Positive class
oneliner_record = _with_label(oneliners, 1)
tweet_record = _with_label(tweets, 1)

#Negative class
proverb_record = _with_label(proverbs, 0)
wiki_record = _with_label(wiki, 0)
reuter_record = _with_label(reuters, 0)

#Concatenate per class; the order within each class matches the order of the operands.
positive_record = oneliner_record + tweet_record
negative_record = wiki_record + proverb_record + reuter_record

#Column names for the final dataframe
columns = ['sentence', 'class']
In [4]:
#We have more negative class records than positive ones. Keep only n of them,
#where n = number of positive class records, so we model a balanced dataset.
#A fixed seed makes both the negative subsample and the final row order
#reproducible on a fresh top-to-bottom run (Restart Kernel -> Run All).
SEED = 42

#Use a dedicated generator so the global `random` state is left untouched.
rng = random.Random(SEED)
rng.shuffle(negative_record)
negative_record = negative_record[:len(positive_record)]

#Create dataframe for modelling
df_record = positive_record + negative_record
df = pd.DataFrame(df_record, columns=columns)

#Randomly shuffle it (seeded) & reset the index
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

#Sanity check: shuffling must not drop or duplicate rows.
assert len(df) == len(positive_record) + len(negative_record)
In [5]:
#Persist the final dataframe (without the index column); the next script uses this file for modelling.
df.to_csv(path_or_buf='datasets/Final_data.csv', index=False)
In [ ]: