#!/usr/bin/env python
# coding: utf-8

# # Retweets and quotes
# Hereafter, "retweets" refers to both retweets and quotes.

# ## Data prep

# ### Load the data and count.

# In[1]:

import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type

# Configure the root logger so that debug output is emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


# Simplify the tweet on load
def retweet_transform(tweet):
    """Reduce a tweet to the fields used by the retweet analysis.

    Args:
        tweet: A tweet as a dict. The keys read here are 'retweeted_status',
            'quoted_status', 'id_str', 'user' and 'created_at'.

    Returns:
        A dict with the ids and screen names of the retweeting and the
        retweeted user plus the parsed creation time, or None when the tweet
        has neither a truthy 'retweeted_status' nor a truthy 'quoted_status'.
    """
    # A retweet takes precedence over a quote when both are present.
    original = tweet.get('retweeted_status') or tweet.get('quoted_status')
    if not original:
        return None

    return dict(
        tweet_id=tweet['id_str'],
        user_id=tweet['user']['id_str'],
        screen_name=tweet['user']['screen_name'],
        retweet_user_id=original['user']['id_str'],
        retweet_screen_name=original['user']['screen_name'],
        tweet_created_at=date_parse(tweet['created_at']),
    )


# NOTE(review): load_tweet_df comes from the project's utils module;
# presumably it applies the transform to every tweet and builds a DataFrame
# with the listed columns -- confirm against utils.
retweet_df = load_tweet_df(
    retweet_transform,
    [
        'tweet_id',
        'user_id',
        'screen_name',
        'retweet_user_id',
        'retweet_screen_name',
        'tweet_created_at',
    ],
)

# ### Number of retweets found in the dataset

# In[2]:

retweet_df[['retweet_user_id']].count()

# ### The retweet data
# Each retweet consists of the tweet id, the screen name and user id that is retweeting,
# and the screen_name and user_id that is retweeted.
# In[3]:

retweet_df.head()

# ### Create lookup of retweeted user ids to screen names

# In[4]:

# From the retweets, extract map of user ids to screen names.
# For each retweeted user, keep the row of the most recent tweet
# (idxmax of tweet_created_at), so the newest screen name is used.
# .loc replaces the .ix indexer, which was removed from pandas.
retweet_user_id_lookup_df = retweet_df.loc[
    retweet_df.groupby('retweet_user_id')['tweet_created_at'].idxmax(),
    ['retweet_user_id', 'retweet_screen_name'],
].set_index(['retweet_user_id'])
retweet_user_id_lookup_df.count()

# In[5]:

retweet_user_id_lookup_df.head()

# ### Create lookup of user ids to screen names

# In[6]:

# From the users (not the retweets), extract map of user ids to screen names.
# As above, the row of each user's most recent tweet supplies the name.
user_id_lookup_df = retweet_df.loc[
    retweet_df.groupby('user_id')['tweet_created_at'].idxmax(),
    ['user_id', 'screen_name'],
].set_index(['user_id'])
user_id_lookup_df.count()

# ### Group retweets by retweeted user id

# In[7]:

# Group by user_id
# This count should match the user_id map count
retweet_summary_user_id_df = pd.DataFrame(
    retweet_df.groupby('retweet_user_id').size(),
    columns=['retweet_count'],
)
retweet_summary_user_id_df.count()

# In[8]:

retweet_summary_user_id_df.head()

# ### Add back in the retweet screen names

# In[9]:

# Join with user id map (both frames are indexed by retweet_user_id)
retweet_summary_screen_name_df = retweet_summary_user_id_df.join(retweet_user_id_lookup_df)
retweet_summary_screen_name_df.count()

# In[10]:

retweet_summary_screen_name_df.head()

# ### Add user types for retweets

# In[11]:

# Load lookups of known users
from utils import load_user_type_lookup_df

# Only the 'type' column of the lookup is needed here.
user_type_lookup_df = load_user_type_lookup_df()[['type']]
user_type_lookup_df.count()

# In[12]:

user_type_lookup_df.head()

# In[13]:

user_type_lookup_df['type'].value_counts()

# In[14]:

# Join the retweets and the known users
retweet_summary_type_df = retweet_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Users without a match in the lookup get the type 'unknown'. The result is
# assigned back rather than using fillna(inplace=True) on the column
# selection, which acts on a temporary object under pandas Copy-on-Write and
# can leave the frame unchanged.
retweet_summary_type_df['type'] = retweet_summary_type_df['type'].fillna('unknown')
retweet_summary_type_df.index.name = 'user_id'
retweet_summary_type_df.count()

# In[15]:

retweet_summary_type_df.head()

# ### Add number of users retweeting
# Which is different than the number of
# retweets.

# In[16]:

# One row per (retweeted user, retweeting user) pair, so that a user who
# retweets the same account several times is counted once.
retweet_user_id_per_user_df = retweet_df[['retweet_user_id', 'user_id']].drop_duplicates()
retweet_user_id_per_user_summary_df = pd.DataFrame(
    retweet_user_id_per_user_df.groupby('retweet_user_id').size(),
    columns=['users_retweeting_count'],
)
retweet_user_id_per_user_summary_df.index.name = 'user_id'
retweet_user_id_per_user_summary_df.head()

# Join with retweet_summary_type_df
retweet_summary_df = retweet_summary_type_df.join(retweet_user_id_per_user_summary_df)
# Share of all retweeting users (one lookup row per user) that retweeted
# this account.
retweet_summary_df['percent_of_users_retweeting'] = (
    retweet_summary_df.users_retweeting_count / user_id_lookup_df['screen_name'].count()
)
retweet_summary_df.head()

# ## Retweet summary

# ### Retweets per user
# For users that made any retweets. It is also possible to figure this out for all users.

# In[17]:

retweet_df['user_id'].value_counts().describe()

# ### How long is the tail?

# In[18]:

# For each value of users_retweeting_count: the total retweet_count ('sum')
# and the number of retweeted accounts ('size'). The string names are used
# instead of np.sum / np.size; they produce the same column labels without
# the pandas warning about passing callables to agg.
retweet_grouped_by_users_retweeting_df = (
    retweet_summary_df[['retweet_count', 'users_retweeting_count']]
    .groupby(by='users_retweeting_count')
    .agg(['sum', 'size'])
)
retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum'] = (
    retweet_grouped_by_users_retweeting_df['retweet_count', 'sum'].cumsum()
)
retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum_percentage'] = (
    retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum']
    / retweet_grouped_by_users_retweeting_df['retweet_count', 'sum'].sum()
)
retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users'] = (
    retweet_grouped_by_users_retweeting_df['retweet_count', 'size'].cumsum()
)
retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users_percentage'] = (
    retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users']
    / retweet_grouped_by_users_retweeting_df['retweet_count', 'size'].sum()
)
retweet_grouped_by_users_retweeting_df

# In[19]:

# NOTE: get_ipython is neither defined nor imported in this file; it is
# provided by IPython, so this line only works when run there.
get_ipython().run_line_magic('matplotlib', 'inline')
retweet_grouped_by_users_retweeting_df[
    ['cumulative_retweet_count_sum_percentage', 'cumulative_retweeted_users_percentage']
].plot()

# ## Cut off the tail.
# Removes users that were only retweeted by 5 or fewer users.

# In[20]:

retweet_summary_df.drop(
    retweet_summary_df[retweet_summary_df.users_retweeting_count <= 5].index,
    inplace=True,
)
retweet_summary_df['retweet_screen_name'].count()

# ## Approach 1: By retweet count

# ### Top accounts (by retweet count)
# Unknown for type indicates that it is not matched with a known Twitter account.

# In[21]:

retweet_summary_df.sort_values('retweet_count', ascending=False).head(50)

# ### Account types (by retweet count)

# In[22]:

types_by_retweet_count_df = retweet_summary_df[['type', 'retweet_count']].groupby('type').sum()
types_by_retweet_count_df['type_percentage'] = (
    types_by_retweet_count_df['retweet_count'] / types_by_retweet_count_df['retweet_count'].sum()
)
types_by_retweet_count_df.sort_values('retweet_count', ascending=False)

# ## Approach 2: Per user
# Retweets by type per user.

# ### Add type by merging screen name lookup

# In[23]:

retweet_all_join_df = pd.merge(
    retweet_df,
    user_type_lookup_df[['type']],
    how='left',
    left_on='retweet_user_id',
    right_index=True,
)
# Retweeted users without a match in the lookup get the type 'unknown'. The
# result is assigned back rather than using fillna(inplace=True) on the
# column selection, which acts on a temporary object under pandas
# Copy-on-Write and can leave the frame unchanged.
retweet_all_join_df['type'] = retweet_all_join_df['type'].fillna('unknown')
# Drop tail: keep only retweets of accounts that survived the cut above.
retweet_all_join_limited_df = retweet_all_join_df[
    retweet_all_join_df.retweet_user_id.isin(retweet_summary_df.index)
]
retweet_all_join_limited_df.head()

# In[24]:

# Rows are retweeting users, columns are account types, values are the
# number of retweets; combinations that never occur are filled with 0.
retweet_summary_by_user_df = (
    retweet_all_join_limited_df.groupby(['user_id', 'type']).size().unstack().fillna(0)
)
# Add a total column
retweet_summary_by_user_df['total'] = retweet_summary_by_user_df.sum(axis=1)
# columns[:-1] skips the 'total' column that was just appended.
for col_name in retweet_summary_by_user_df.columns[:-1]:
    retweet_summary_by_user_df['{}_percent'.format(col_name)] = (
        retweet_summary_by_user_df[col_name] / retweet_summary_by_user_df.total
    )
retweet_summary_by_user_df.head(10)

# ### Average of percent of retweets by type for each user
# That is, for each user determine the percent of retweets by type.
# Then take the average of each type.
#
# Thus, this retweet analysis is on a per-user basis, accounting for how prolific a tweeter a user is.
# (That is, users who tweet more aren't weighted more heavily.)
#

# In[25]:

# Column-wise mean over every column whose name ends in "_percent".
retweet_summary_by_user_df.filter(regex="_percent$", axis=1).mean()

# ## Approach 3: By count of users retweeting
# The number of users that retweeted an account. Thus, each user counts as 1,
# even if that user made multiple retweets of the account.
#
# This weights an account that is retweeted by 100 users more heavily than an
# account that is retweeted 100 times by a single user.

# In[26]:

retweet_summary_df.sort_values(by='users_retweeting_count', ascending=False).head(20)

# ### Account types (by count of users retweeting)

# In[27]:

types_by_users_retweeting_df = (
    retweet_summary_df.loc[:, ['type', 'users_retweeting_count']]
    .groupby('type')
    .sum()
)
# Each type's share of the summed users_retweeting_count.
types_by_users_retweeting_df['type_percentage'] = (
    types_by_users_retweeting_df['users_retweeting_count'].div(
        types_by_users_retweeting_df['users_retweeting_count'].sum()
    )
)
types_by_users_retweeting_df.sort_values(by='users_retweeting_count', ascending=False)

# ## Unknown accounts
# Remember, the tail has been cut off

# ### Number of unknown accounts

# In[28]:

retweet_summary_df.loc[retweet_summary_df['type'].eq('unknown')].count()

# ### Number of known accounts

# In[29]:

retweet_summary_df.loc[retweet_summary_df['type'].ne('unknown')].count()

# ### Top unknown by retweet count that are retweeted by at least 5 users

# In[32]:

# Select the unknown accounts retweeted by at least 5 users, order them by
# retweet count, then keep only the columns that are written out.
top_not_known_retweet_df = (
    retweet_summary_df.loc[
        retweet_summary_df['type'].eq('unknown')
        & retweet_summary_df['users_retweeting_count'].ge(5)
    ]
    .sort_values(by='retweet_count', ascending=False)
    .loc[:, ['retweet_screen_name', 'retweet_count', 'users_retweeting_count']]
)
top_not_known_retweet_df.head(50)

# ### Write top accounts to file

# In[33]:

top_not_known_retweet_df.to_csv('unknown_retweets.csv')

# In[ ]: