#!/usr/bin/env python
# coding: utf-8

# # Tweet summary

# ## Prepare the tweet data

# ### Load the tweets

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Set float format so doesn't display scientific notation
pd.options.display.float_format = '{:20,.2f}'.format


def tweet_transform(tweet):
    """Flatten one tweet into the row fields used by this notebook.

    Args:
        tweet: Mapping for a single tweet. It must provide 'id_str',
            'created_at' and a 'user' mapping with 'id_str', 'screen_name',
            'created_at' and 'statuses_count'.

    Returns:
        dict with the keys 'tweet_id', 'tweet_created_at', 'user_id',
        'screen_name', 'user_created_at', 'tweets_to_date' and 'tweet_type'.
        Both created-at values are parsed with dateutil's ``parse``;
        'tweet_type' is whatever ``utils.tweet_type`` returns for the tweet.

    Raises:
        KeyError: If any of the fields listed above is missing.
    """
    user = tweet['user']
    return {
        'tweet_id': tweet['id_str'],
        'tweet_created_at': date_parse(tweet['created_at']),
        'user_id': user['id_str'],
        'screen_name': user['screen_name'],
        'user_created_at': date_parse(user['created_at']),
        'tweets_to_date': user['statuses_count'],
        'tweet_type': tweet_type(tweet),
    }


# The list repeats the keys produced by tweet_transform -- presumably the
# column names/order of the resulting frame; verify against utils.load_tweet_df.
tweet_df = load_tweet_df(
    tweet_transform,
    ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at',
     'user_created_at', 'tweets_to_date', 'tweet_type'],
)
tweet_df.count()


# ### View the top of the data.

# In[2]:


tweet_df.head()


# ## Prepare the user data

# ### Tweets in dataset for each user

# In[3]:


# Tweet types that are summed into tweets_in_dataset.
TWEET_TYPE_COLUMNS = ['original', 'quote', 'reply', 'retweet']

# One row per user, one column per tweet type, values are tweet counts.
user_tweet_count_df = (
    tweet_df[['user_id', 'tweet_type']]
    .groupby(['user_id', 'tweet_type'])
    .size()
    .unstack()
)
# unstack() only creates columns for tweet types that actually occur, so a
# dataset without (say) any quotes would have no 'quote' column and the total
# below would fail. Add any missing type as an all-zero column.
for tweet_type_column in TWEET_TYPE_COLUMNS:
    if tweet_type_column not in user_tweet_count_df.columns:
        user_tweet_count_df[tweet_type_column] = 0
# A user with no tweets of a given type has NaN there after unstack().
user_tweet_count_df.fillna(0, inplace=True)
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[TWEET_TYPE_COLUMNS].sum(axis=1)
# Bin users by tweet volume, cutting at the 90th and 99th percentiles.
user_tweet_count_df['tweets_in_dataset_bin'] = pd.qcut(
    user_tweet_count_df.tweets_in_dataset,
    [0, .9, .99, 1.],
    labels=['Bottom 90%', 'Middle 9%', 'Top 1%'],
)
user_tweet_count_df.head()


# ### Load and join user info
# This is information that was coded in the spreadsheet or looked up for each user via API.
# In[4]:


# Column labels are supplied explicitly; user_id is read as a string and
# becomes the index so it can be joined against the per-user tweet counts.
user_info_df = pd.read_csv(
    'source_data/user_info_lookup.csv',
    names=['screen_name', 'user_id', 'name', 'organization', 'position',
           'gender', 'followers_count', 'following_count', 'tweet_count',
           'user_created_at', 'verified', 'protected'],
    dtype={'user_id': str},
).set_index(['user_id'])
user_info_df.count()


# In[5]:


user_info_df.head()


# In[6]:


# Join
user_summary_df = user_info_df.join(user_tweet_count_df, how='left')
# Fill Nans
# A single DataFrame-level fillna is used instead of
# `user_summary_df[col].fillna(..., inplace=True)`: the per-column in-place
# form is chained assignment, which under pandas Copy-on-Write only modifies
# a temporary Series and leaves user_summary_df unchanged.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()


# In[7]:


user_summary_df.head()


# ### Write to file as output/user_summary.csv

# In[8]:


user_summary_df.to_csv('output/user_summary.csv')


# ## Prepare the organization data
# This is for users that are members of each organization.

# In[9]:


# Sum, group size and average of each numeric column per organization.
org_summary_df = user_summary_df[
    ['organization', 'followers_count', 'following_count', 'tweet_count', 'tweets_in_dataset']
].groupby('organization').agg([np.sum, np.size, np.average])
org_summary_df.count()


# In[10]:


org_summary_df.head()


# ### Write to file as output/organization_summary.csv

# In[12]:


org_summary_df.to_csv('output/organization_summary.csv')


# ### List of organizations <--- This probably requires some cleanup

# In[13]:


org_summary_df.index.tolist()


# ## Tweet summary
# For tweets in dataset.

# ### Types of tweets

# In[14]:


tweet_df['tweet_type'].value_counts()


# ## User tweet summary

# ### Types of tweets in dataset for each user

# In[15]:


user_summary_df[['original', 'quote', 'reply', 'retweet']].describe()


# ### 1/9/90 rule
# For top 1%, 9%, 90% of tweeters, the number of tweets and types of tweets they account for.
# In[16]:


# Users that fall in the top 1% bin by number of tweets in the dataset.
user_summary_df[user_summary_df['tweets_in_dataset_bin'] == 'Top 1%']


# In[17]:


# Total tweets of each type contributed by each bin of users.
tweets_in_dataset_bin_summary_df = (
    user_summary_df[
        ['original', 'quote', 'reply', 'retweet', 'tweets_in_dataset', 'tweets_in_dataset_bin']
    ]
    .groupby('tweets_in_dataset_bin')
    .sum()
)

# Each bin's share of the column total, as (output column, source column).
# The order here is the order in which the columns are appended.
for share_column, count_column in (
    ('percent_of_original', 'original'),
    ('percent_of_quote', 'quote'),
    ('percent_of_reply', 'reply'),
    ('percent_of_retweets', 'retweet'),
    ('percent_of_tweets_in_dataset', 'tweets_in_dataset'),
):
    tweets_in_dataset_bin_summary_df[share_column] = (
        tweets_in_dataset_bin_summary_df[count_column]
        / tweets_in_dataset_bin_summary_df[count_column].sum()
    )

# Number of users in each bin.
tweets_in_dataset_bin_summary_df['users_in_bin'] = (
    user_summary_df[['tweets_in_dataset_bin', 'tweets_in_dataset']]
    .groupby('tweets_in_dataset_bin')
    .count()
)
tweets_in_dataset_bin_summary_df


# ## User summary

# In[18]:


user_summary_df[['followers_count', 'following_count', 'tweet_count']].describe()


# ### Gender

# In[19]:


user_summary_df['gender'].value_counts()


# ## Organization

# ### Top by average followers

# In[20]:


org_summary_df[['followers_count']].sort_values(
    by=[('followers_count', 'average')], ascending=False
).head()


# ### Top by average following

# In[21]:


org_summary_df[['following_count']].sort_values(
    by=[('following_count', 'average')], ascending=False
).head()


# ### Top by average tweet count

# In[22]:


org_summary_df[['tweet_count']].sort_values(
    by=[('tweet_count', 'average')], ascending=False
).head()


# ### Top by number of tweets in dataset

# In[23]:


org_summary_df[['tweets_in_dataset']].sort_values(
    by=[('tweets_in_dataset', 'sum')], ascending=False
).head()
# ## First tweet for each user

# In[24]:


# Get the first tweet for each user
# idxmin() gives, per user_id, the row label of that user's earliest tweet.
earliest_tweet_labels = tweet_df.groupby('user_id')['tweet_created_at'].idxmin()
first_tweet_df = tweet_df.loc[earliest_tweet_labels].set_index(['user_id'])
first_tweet_df.count()


# In[25]:


# Users whose first tweet in the dataset is the most recent come first.
first_tweet_df.sort_values(by='tweet_created_at', ascending=False).head()


# ### Most recent first tweet

# In[26]:


first_tweet_df['tweet_created_at'].max()