#!/usr/bin/env python
# coding: utf-8

# # @ObviousOstrich Generation Experiment
# - @obviousostrich is a twitter account [here](https://twitter.com/search?q=obvious%20ostrich&src=tyah)
# - Collected all of its tweets (about 17k) using the Twitter API.
# - I wanted to do a small text generation experiment anyway.
# - **I was skeptical how a bigram-trigram model will work on a little dataset like this. It works nice on reuters dataset.**
# - **Surprisingly, it does okay.** The reason is the same reason I thought it would fail, i.e. it has very little repetition of words, so it has merged two or max three tweets into one, generating some funny obvious work.
# - The **biggest downside** of working on this small a dataset is that sometimes it just produces the exact tweet.
#
# NOTE: the third-party imports (pandas, spaCy, NLTK) live inside the
# ``__main__`` section below so that the two pure helpers can be imported
# and tested without those packages or the CSV file being present.

import random
from collections import defaultdict

try:
    text_type = unicode  # Python 2: the original script passed unicode to spaCy
except NameError:
    text_type = str  # Python 3: str is already a unicode string


def normalize_counts(model):
    """Turn nested trigram counts into conditional probabilities, in place.

    ``model`` maps a ``(w1, w2)`` context to a mapping ``{w3: count}``.
    After the call every inner value is ``count / total`` for its context.
    Contexts whose counts sum to zero are left untouched instead of
    raising ``ZeroDivisionError``.

    Returns the same ``model`` object for convenience.
    """
    for followers in model.values():
        total_count = float(sum(followers.values()))
        if not total_count:
            # Nothing was observed for this context (e.g. an entry created
            # only by a lookup on a defaultdict); there is nothing to scale.
            continue
        for w3 in followers:
            followers[w3] /= total_count
    return model


def generate_tweet(model, rng=random):
    """Sample one sentence from a trigram probability model.

    ``model`` maps a ``(w1, w2)`` context to ``{w3: probability}``, where
    ``None`` is the padding symbol marking sentence start and end.
    ``rng`` is any object with a ``random()`` method returning a float in
    ``[0, 1)``; it defaults to the ``random`` module.

    Generation starts from the ``(None, None)`` context and stops once two
    consecutive padding symbols have been produced, or when the current
    context has no usable continuation. The result is the generated words
    joined by single spaces, with padding symbols dropped.
    """
    text = [None, None]
    while True:
        # .get() avoids silently inserting empty contexts into a defaultdict.
        candidates = model.get(tuple(text[-2:]))
        if not candidates:
            break  # unseen context: nothing can follow it

        r = rng.random()
        accumulator = 0.0
        picked = False
        for word, prob in candidates.items():
            accumulator += prob
            # Never emit a word that has zero probability.
            if prob > 0 and accumulator >= r:
                text.append(word)
                picked = True
                break

        if not picked:
            if accumulator <= 0.0:
                break  # context carries no probability mass at all
            # Floating-point shortfall (sum slightly below r): draw again.
            continue

        if text[-2:] == [None, None]:
            break  # reached the right-hand padding: sentence finished

    return ' '.join([t for t in text if t])


if __name__ == '__main__':
    # In[2]:
    import pandas as pd

    # In[6]:
    df = pd.read_csv('procpos.csv')

    # In[109]:
    df.head(3)  # notebook cell preview; prints nothing when run as a script

    # In[23]:
    import spacy

    # NOTE(review): the 'en' shortcut is what the original notebook used;
    # newer spaCy releases may require a full package name -- confirm.
    nlp = spacy.load('en')

    # In[62]:
    # everything to lowercase
    tweets = [row.lower() for row in df['tweets']]

    # list-of-lists representation of sentences, i.e. each list of words
    # is one tweet
    tweet_wordlist = []
    for tweet in tweets:
        doc = nlp(text_type(tweet))
        # filter out urls and punctuation
        tweet_wordlist.append(
            [token.text for token in doc
             if not token.like_url and not token.is_punct]
        )

    # both should be equal
    print('%d %d' % (len(tweets), len(tweet_wordlist)))

    # In[63]:
    from nltk import bigrams, trigrams

    # checking out stuff...
    first_sentence = tweet_wordlist[0]
    print(first_sentence)

    # Get the bigrams
    print(list(bigrams(first_sentence)))

    # Get the padded bigrams
    print(list(bigrams(first_sentence, pad_left=True, pad_right=True)))

    # Get the trigrams
    print(list(trigrams(first_sentence)))

    # Get the padded trigrams
    print(list(trigrams(first_sentence, pad_left=True, pad_right=True)))

    # In[110]:
    # model[(w1, w2)][w3] counts how often w3 follows the pair (w1, w2)
    model = defaultdict(lambda: defaultdict(int))
    for tweet in tweet_wordlist:
        for w1, w2, w3 in trigrams(tweet, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1

    # how many tweets start with 'you'?
    print(model[None, None]["you"])

    # Let's transform the counts to probabilities.
    normalize_counts(model)

    # In[100]:
    # ran repeatedly to generate 24 samples
    print(generate_tweet(model))


# # Results: Broad classification of Random 24 samples generated
#
# ### Type 1: Obvious tweets which actually make sense were generated :)
# ```
# produced = original tweet1 + original tweet2 + ...
# ```
# 1. **cows do not have a gender** = Cows do not meow + ketchup doesn't have a gender
# 2. **birds like to smoke weed** = birds like to fly + Wiz Khalifa and Snoop Dogg like to smoke weed.
# 3. **hot chocolate is not red** = Hot chocolate is not a chilled beverage. + The color blue is not red.
# 4. **butterflies are not good at math** = Butterflies are not made of butter + cows are not good at math
# 5. **elephants are not a plant that grows eggs** = Elephants are not fish + An eggplant is not a plant that grows eggs.
# 6. **you should not drink boiling water is very hot** = You should not drink boiling water + Studies show that fire is very hot.
# 7. **sunscreen will not quench your thirst** = Sunscreen will not work if you eat it. + Eating sand will not quench your thirst.
# 8. **butterflies are not good for your health** = Butterflies are not made of butter + Smoking crack is not good for your health.
# 9. **earphones are n't the greatest present to give to a tree** = Earphones aren't the greatest present to give to a deaf person. + It is very difficult to staple water to a tree.
# 10. **if you jump in front of a window you can jump in a burger king** = If you jump in front of a train, there's a good chance you'll die. + If you walk into your home and it looks exactly like a Burger King, then you might be in a Burger King.
#
# Even some sarcastic taunts :)
#
# 11. **take your age and add 5 to it that is your exact age** = Take your age and add 5 to it. That is your age in 5 years. + Take your age and subtract 2 then add 2. That is your exact age.
#
# ### Type 2: 100% Original tweets regenerated
#
# 12. 100% of divorced men were once married
# 13. sharks eat more people than potatoes do
# 14. bottled water contains water
# 15. do n't set your house on fire
#
# ### Type 3: Incoherent or little sensible tweets
# 16. this is a great way to stop yourself from dying
# 17. according to recent studies suggest that there are at least one friend
# 18. if you pour water on the ground side by side you would be dead
# 19. according to recent studies suggest that there are 4 letters of the time water will get you drunk
# 20. age is not monday
# 21. it is not actually mean there is a once in a six pack of beer
# 22. breathing is something that you have wet hands
# 23. apple does not actually made out of a window
# 24. when you have taken a breath within the last three seconds you will probably hurt
#