%%bash
pip install twitter pandas nltk

import io
import json
import twitter

# XXX: Go to http://twitter.com/apps/new to create an app and get values
# for these credentials that you'll need to provide in place of these
# empty string values that are defined as placeholders.
#
# See https://vimeo.com/79220146 for a short video that steps you
# through this process.
#
# See https://dev.twitter.com/docs/auth/oauth for more information
# on Twitter's OAuth implementation.

CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''

# The keyword query
QUERY = 'Amazon'

# The file to write output to as newline-delimited JSON documents
OUT_FILE = QUERY + ".json"

# Authenticate to Twitter with OAuth
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

# Create a connection to the Streaming API
twitter_stream = twitter.TwitterStream(auth=auth)

print 'Filtering the public timeline for "{0}"'.format(QUERY)

# See https://dev.twitter.com/docs/streaming-apis on keyword parameters
stream = twitter_stream.statuses.filter(track=QUERY)

# Write one tweet per line as a JSON document
with io.open(OUT_FILE, 'w', encoding='utf-8', buffering=1) as f:
    for tweet in stream:
        f.write(unicode(u'{0}\n'.format(json.dumps(tweet, ensure_ascii=False))))
        print tweet['text']

import pandas as pd

# A text file with one tweet per line
DATA_FILE = "tmp/Amazon.json"

# Build a JSON array from the newline-delimited documents
data = "[{0}]".format(",".join([l for l in open(DATA_FILE).readlines()]))

# Create a pandas DataFrame (think: 2-dimensional table) to get a
# spreadsheet-like interface into the data
df = pd.read_json(data, orient='records')

print "Successfully imported", len(df), "tweets"

# Printing a DataFrame shows how pandas exposes a columnar view of the data
print df

# Observe the "limit" field that reflects "limit notices" where the streaming API
# couldn't return more than 1% of the firehose.
# See https://dev.twitter.com/docs/streaming-apis/messages#Limit_notices_limit

# Capture the limit notices by indexing into the DataFrame for rows with a
# non-null "limit" field
limit_notices = df[pd.notnull(df.limit)]

# Remove the limit notice rows from the DataFrame entirely by keeping only
# the rows that have a tweet id
df = df[pd.notnull(df['id'])]

print "Number of total tweets that were rate-limited", \
      sum([ln['track'] for ln in limit_notices.limit])
print "Total number of limit notices", len(limit_notices)
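# Aside: a minimal alternative sketch for loading the data. Newer pandas
# releases can read newline-delimited JSON directly via the lines=True
# keyword of read_json, which avoids building the intermediate JSON array
# string in memory. This assumes such a pandas version is installed; the
# load_tweets helper below is illustrative and not part of the original
# workflow.

def load_tweets(path):
    """Read one JSON document per line into a DataFrame."""
    return pd.read_json(path, orient='records', lines=True)

# e.g. df = load_tweets(DATA_FILE) would replace the array-building step above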
# Create a time-based index on the tweets for time series analysis
# on the created_at field of the existing DataFrame.
df.set_index('created_at', drop=False, inplace=True)
print "Created date/time index on tweets"

# Get a sense of the time range for the data
print "First tweet timestamp (UTC)", df['created_at'][0]
print "Last tweet timestamp (UTC) ", df['created_at'][-1]

import matplotlib.pyplot as plt

# Let's group the tweets by hour and look at the overall volumes with a simple
# text-based histogram

# First group by the hour
grouped = df.groupby(lambda x: x.hour)

print "Number of relevant tweets by the hour (UTC)"
print

# You can iterate over the groups and print out the volume of tweets
# for each hour along with a simple text-based histogram
for hour, group in grouped:
    print hour, len(group), '*'*(len(group) / 1000)

# Let's group the tweets into (hour, 15-minute interval) buckets and look at
# the overall volumes with a simple text-based histogram

def group_by_15_min_intervals(x):
    if 0 <= x.minute <= 15:
        return (x.hour, "0-15")
    elif 15 < x.minute <= 30:
        return (x.hour, "16-30")
    elif 30 < x.minute <= 45:
        return (x.hour, "31-45")
    else:
        return (x.hour, "46-00")

grouped = df.groupby(lambda x: group_by_15_min_intervals(x))

print "Number of relevant tweets by intervals (UTC)"
print

for interval, group in grouped:
    print interval, len(group), "\t", '*'*(len(group) / 200)

# Since we didn't start or end precisely on an interval, let's
# slice off the extremes. This has the added benefit of also
# improving the resolution of the plot that shows the trend
plt.plot([len(group) for hour, group in grouped][1:-1])
plt.ylabel("Tweet Volume")
plt.xlabel("Time")

from collections import Counter

# The "user" field is a record (dictionary), and we can pop it off
# and then use the Series constructor to make it easy to use with pandas
user_col = df.pop('user').apply(pd.Series)

# Get the screen name column
authors = user_col.screen_name

# And count things
authors_counter = Counter(authors.values)

# And tally the totals
print
print "Most frequent (top 25) authors of tweets"
print '\n'.join(["{0}\t{1}".format(a, f)
                 for a, f in authors_counter.most_common(25)])
print

# Get only the unique authors
num_unique_authors = len(set(authors.values))
print "There are {0} unique authors out of {1} tweets".format(num_unique_authors, len(df))

# Plot by rank (sorted value) to gain intuition about the shape of the distribution
author_freqs = sorted(authors_counter.values())

plt.loglog(author_freqs)
plt.ylabel("Num Tweets by Author")
plt.xlabel("Author Rank")

# Start a new figure
plt.figure()

# Plot a histogram to "zoom in" and increase resolution
plt.hist(author_freqs, log=True)
plt.ylabel("Num Authors")
plt.xlabel("Num Tweets")
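# Aside: the same interval-based volume counts can be expressed with pandas'
# resample machinery on the date/time index instead of a hand-rolled grouping
# function. A minimal sketch, assuming a pandas version where
# resample(...).size() is available (the resample API has changed across
# releases):

volume_by_15_min = df.resample('15Min').size()
print volume_by_15_min

# volume_by_15_min.plot() would draw a similar trend line, without the manual
# slicing of partial intervals done above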
# What languages do authors of tweets speak? This might be a useful clue
# as to who is tweeting. (Also bear in mind the general timeframe for the
# data when interpreting these results.)
df.lang.value_counts()

# Let's just look at the content of the English tweets by extracting it
# out as a list of text
en_text = df[df['lang'] == 'en'].pop('text')

from collections import Counter

tokens = []
for txt in en_text.values:
    tokens.extend([t.lower().strip(":,.") for t in txt.split()])

# Use a Counter to construct frequency tuples
tokens_counter = Counter(tokens)

# Display some of the most commonly occurring tokens
tokens_counter.most_common(50)

import nltk

# Download the stopwords list into NLTK
nltk.download('stopwords')

# Remove stopwords to decrease noise (pop with a default so that stopwords
# that never appeared in the tweets don't raise a KeyError)
for t in nltk.corpus.stopwords.words('english'):
    tokens_counter.pop(t, None)

# Redisplay the data (and then some)
tokens_counter.most_common(200)

nltk_text = nltk.Text(tokens)

nltk_text.collocations()

nltk_text.concordance("amazing")

nltk_text.concordance("holy")

# A crude look at tweet entities
entities = []
for txt in en_text.values:
    for t in txt.split():
        if t.startswith("http") or t.startswith("@") or t.startswith("#"):
            if not t.startswith("http"):
                t = t.lower()
            entities.append(t.strip(" :,"))

entities_counter = Counter(entities)

for entity, freq in entities_counter.most_common()[:100]:
    print entity, freq
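# Aside: a less crude take on entities. Every status returned by the API
# carries a structured "entities" field with hashtags and user_mentions
# already parsed out, so the raw-text heuristics above can be cross-checked
# against it. A minimal sketch, assuming the entities column survived the
# earlier column manipulations and follows the standard v1.1 payload:

structured_entities = Counter()
for e in df[df['lang'] == 'en'].entities:
    if not isinstance(e, dict):
        continue
    structured_entities.update(["#" + h['text'].lower()
                                for h in e.get('hashtags', [])])
    structured_entities.update(["@" + m['screen_name'].lower()
                                for m in e.get('user_mentions', [])])

for entity, freq in structured_entities.most_common(25):
    print entity, freq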