%%bash
pip install twitter pandas nltk

import io
import json
import twitter

# XXX: Go to http://twitter.com/apps/new to create an app and get values
# for these credentials that you'll need to provide in place of these
# empty string values that are defined as placeholders.
#
# See https://vimeo.com/79220146 for a short video that steps you
# through this process.
#
# See https://dev.twitter.com/docs/auth/oauth for more information
# on Twitter's OAuth implementation.

CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''

# The keyword query
QUERY = 'Amazon'

# The file to write output to as newline-delimited JSON documents
OUT_FILE = QUERY + ".json"

# Authenticate to Twitter with OAuth
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)

# Create a connection to the Streaming API
twitter_stream = twitter.TwitterStream(auth=auth)

print 'Filtering the public timeline for "{0}"'.format(QUERY)

# See https://dev.twitter.com/docs/streaming-apis on keyword parameters
stream = twitter_stream.statuses.filter(track=QUERY)

# Write one tweet per line as a JSON document
with io.open(OUT_FILE, 'w', encoding='utf-8', buffering=1) as f:
    for tweet in stream:
        f.write(unicode(u'{0}\n'.format(json.dumps(tweet, ensure_ascii=False))))
        print tweet['text']

import pandas as pd

# A text file with one tweet per line
DATA_FILE = "tmp/Amazon.json"

# Build a JSON array from the newline-delimited documents
data = "[{0}]".format(",".join([l for l in open(DATA_FILE).readlines()]))

# Create a pandas DataFrame (think: 2-dimensional table) to get a
# spreadsheet-like interface into the data
df = pd.read_json(data, orient='records')

print "Successfully imported", len(df), "tweets"

# Printing a DataFrame shows how pandas exposes a columnar view of the data
print df

# Observe the "limit" field that reflects "limit notices" where the streaming API
# couldn't return more than 1% of the firehose.
# See https://dev.twitter.com/docs/streaming-apis/messages#Limit_notices_limit

# Capture the limit notices by indexing into the DataFrame for rows with a
# non-null "limit" field
limit_notices = df[pd.notnull(df.limit)]

# Remove the limit notice rows from the DataFrame entirely by keeping only
# the rows that have a tweet id
df = df[pd.notnull(df['id'])]

print "Number of total tweets that were rate-limited", \
      sum([ln['track'] for ln in limit_notices.limit])
print "Total number of limit notices", len(limit_notices)
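# Aside: a minimal alternative sketch for loading the data. Newer pandas
# releases can read newline-delimited JSON directly via the lines=True
# keyword of read_json, which avoids building the intermediate JSON array
# string in memory. This assumes such a pandas version is installed; the
# load_tweets helper below is illustrative and not part of the original
# workflow.

def load_tweets(path):
    """Read one JSON document per line into a DataFrame."""
    return pd.read_json(path, orient='records', lines=True)

# e.g. df = load_tweets(DATA_FILE) would replace the array-building step above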
# Create a time-based index on the tweets for time series analysis
# on the created_at field of the existing DataFrame.
df.set_index('created_at', drop=False, inplace=True)
print "Created date/time index on tweets"

# Get a sense of the time range for the data
print "First tweet timestamp (UTC)", df['created_at'][0]
print "Last tweet timestamp (UTC) ", df['created_at'][-1]

import matplotlib.pyplot as plt

# Let's group the tweets by hour and look at the overall volumes with a simple
# text-based histogram

# First group by the hour
grouped = df.groupby(lambda x: x.hour)

print "Number of relevant tweets by the hour (UTC)"
print

# You can iterate over the groups and print out the volume of tweets
# for each hour along with a simple text-based histogram
for hour, group in grouped:
    print hour, len(group), '*'*(len(group) / 1000)

# Let's group the tweets into (hour, 15-minute interval) buckets and look at
# the overall volumes with a simple text-based histogram

def group_by_15_min_intervals(x):
    if 0 <= x.minute <= 15:
        return (x.hour, "0-15")
    elif 15 < x.minute <= 30:
        return (x.hour, "16-30")
    elif 30 < x.minute <= 45:
        return (x.hour, "31-45")
    else:
        return (x.hour, "46-00")

grouped = df.groupby(lambda x: group_by_15_min_intervals(x))

print "Number of relevant tweets by intervals (UTC)"
print

for interval, group in grouped:
    print interval, len(group), "\t", '*'*(len(group) / 200)

# Since we didn't start or end precisely on an interval, let's
# slice off the extremes. This has the added benefit of also
# improving the resolution of the plot that shows the trend
plt.plot([len(group) for hour, group in grouped][1:-1])
plt.ylabel("Tweet Volume")
plt.xlabel("Time")

from collections import Counter

# The "user" field is a record (dictionary), and we can pop it off
# and then use the Series constructor to make it easy to use with pandas
user_col = df.pop('user').apply(pd.Series)

# Get the screen name column
authors = user_col.screen_name

# And count things
authors_counter = Counter(authors.values)

# And tally the totals
print
print "Most frequent (top 25) authors of tweets"
print '\n'.join(["{0}\t{1}".format(a, f)
                 for a, f in authors_counter.most_common(25)])
print

# Get only the unique authors
num_unique_authors = len(set(authors.values))
print "There are {0} unique authors out of {1} tweets".format(num_unique_authors, len(df))

# Plot by rank (sorted value) to gain intuition about the shape of the distribution
author_freqs = sorted(authors_counter.values())

plt.loglog(author_freqs)
plt.ylabel("Num Tweets by Author")
plt.xlabel("Author Rank")

# Start a new figure
plt.figure()

# Plot a histogram to "zoom in" and increase resolution
plt.hist(author_freqs, log=True)
plt.ylabel("Num Authors")
plt.xlabel("Num Tweets")
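# Aside: the same interval-based volume counts can be expressed with pandas'
# resample machinery on the date/time index instead of a hand-rolled grouping
# function. A minimal sketch, assuming a pandas version where
# resample(...).size() is available (the resample API has changed across
# releases):

volume_by_15_min = df.resample('15Min').size()
print volume_by_15_min

# volume_by_15_min.plot() would draw a similar trend line, without the manual
# slicing of partial intervals done above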
# What languages do authors of tweets speak? This might be a useful clue
# as to who is tweeting. (Also bear in mind the general timeframe for the
# data when interpreting these results.)
df.lang.value_counts()

# Let's just look at the content of the English tweets by extracting it
# out as a list of text
en_text = df[df['lang'] == 'en'].pop('text')

from collections import Counter

tokens = []
for txt in en_text.values:
    tokens.extend([t.lower().strip(":,.") for t in txt.split()])

# Use a Counter to construct frequency tuples
tokens_counter = Counter(tokens)

# Display some of the most commonly occurring tokens
tokens_counter.most_common(50)

import nltk

# Download the stopwords list into NLTK
nltk.download('stopwords')

# Remove stopwords to decrease noise (pop with a default so that stopwords
# that never appeared in the tweets don't raise a KeyError)
for t in nltk.corpus.stopwords.words('english'):
    tokens_counter.pop(t, None)

# Redisplay the data (and then some)
tokens_counter.most_common(200)

nltk_text = nltk.Text(tokens)

nltk_text.collocations()

nltk_text.concordance("amazing")

nltk_text.concordance("holy")

# A crude look at tweet entities
entities = []
for txt in en_text.values:
    for t in txt.split():
        if t.startswith("http") or t.startswith("@") or t.startswith("#"):
            if not t.startswith("http"):
                t = t.lower()
            entities.append(t.strip(" :,"))

entities_counter = Counter(entities)

for entity, freq in entities_counter.most_common()[:100]:
    print entity, freq
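# Aside: a less crude take on entities. Every status returned by the API
# carries a structured "entities" field with hashtags and user_mentions
# already parsed out, so the raw-text heuristics above can be cross-checked
# against it. A minimal sketch, assuming the entities column survived the
# earlier column manipulations and follows the standard v1.1 payload:

structured_entities = Counter()
for e in df[df['lang'] == 'en'].entities:
    if not isinstance(e, dict):
        continue
    structured_entities.update(["#" + h['text'].lower()
                                for h in e.get('hashtags', [])])
    structured_entities.update(["@" + m['screen_name'].lower()
                                for m in e.get('user_mentions', [])])

for entity, freq in structured_entities.most_common(25):
    print entity, freq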