'''
This script reads the following inputs:
- Our processed tweets (i.e. the output of 02_Append_All_Tweets_together.ipynb)
- A readily available humor corpus (positive as well as negative class) - one-liners, proverbs, Wikipedia sentences, Reuters news headlines. Found at https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection
It creates a final dataframe with (sentence,class) columns that we will use later for modelling.
'''
#Import required libraries
import pandas as pd
import random
# Read in our processed tweets (output of 02_Append_All_Tweets_together.ipynb).
tweets = pd.read_csv('datasets/Tweets_combined_20190714.csv')
tweets = tweets['tweet_text']
# Also read in the additional humor corpus found at
# https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection
oneliners = pd.read_pickle('datasets/humorous_oneliners.pickle')
proverbs = pd.read_pickle('datasets/proverbs.pickle')
wiki = pd.read_pickle('datasets/wiki_sentences.pickle')
reuters = pd.read_pickle('datasets/reuters_headlines.pickle')
# Positive class (label 1): our tweets + humorous one-liners.
# Negative class (label 0): proverbs, Wikipedia sentences, Reuters headlines.
oneliner_record = [(sent, 1) for sent in oneliners]
tweet_record = [(tweet_text, 1) for tweet_text in tweets]
proverb_record = [(sent, 0) for sent in proverbs]
wiki_record = [(sent, 0) for sent in wiki]
reuter_record = [(sent, 0) for sent in reuters]
positive_record = oneliner_record + tweet_record
negative_record = wiki_record + proverb_record + reuter_record
columns = ['sentence', 'class']
# Seed the RNG so the generated dataset is reproducible across runs.
random.seed(42)
# We have more negative-class records than positive ones. Down-sample the
# negatives to n = number of positive records so we model a balanced dataset.
# min() guards against the (unexpected) case of fewer negatives than positives,
# matching the tolerance of the original shuffle-and-slice approach.
negative_record = random.sample(
    negative_record, min(len(positive_record), len(negative_record))
)
# Create the dataframe used for modelling.
df_record = positive_record + negative_record
df = pd.DataFrame(df_record, columns=columns)
# Randomly shuffle the rows and reset the index (seeded for reproducibility).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Save it. This file is consumed by the modelling step in the next script.
df.to_csv('datasets/Final_data.csv', index=False)