#!/usr/bin/env python
# coding: utf-8

# # Retweets and quotes
# Hereafter referring to retweets and quotes as retweets.

# ## Data prep

# ### Load the data and count.

# In[1]:


import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Simplify the tweet on load
def retweet_transform(tweet):
    """Reduce a tweet dict to the fields used by the retweet analysis.

    Retweets and quotes are both treated as retweets. When a tweet carries
    both a ``retweeted_status`` and a ``quoted_status``, the retweeted
    status is the one used.

    Args:
        tweet: A decoded tweet dict.

    Returns:
        A dict with the tweet id, the retweeting user's id and screen name,
        the retweeted user's id and screen name, and the parsed creation
        time; or None when the tweet is neither a retweet nor a quote.
    """
    source_tweet = tweet.get('retweeted_status') or tweet.get('quoted_status')
    if not source_tweet:
        return None

    return dict(
        tweet_id=tweet['id_str'],
        user_id=tweet['user']['id_str'],
        screen_name=tweet['user']['screen_name'],
        retweet_user_id=source_tweet['user']['id_str'],
        retweet_screen_name=source_tweet['user']['screen_name'],
        tweet_created_at=date_parse(tweet['created_at']),
    )

# retweet_transform returns None for tweets that are neither retweets nor quotes.
# NOTE(review): presumably load_tweet_df skips those None results - confirm in utils.
retweet_df = load_tweet_df(
    retweet_transform,
    [
        'tweet_id',
        'user_id',
        'screen_name',
        'retweet_user_id',
        'retweet_screen_name',
        'tweet_created_at',
    ],
)


# ### Number of retweets found in the dataset

# In[2]:


retweet_df.loc[:, ['retweet_user_id']].count()


# ### The retweet data
# Each retweet consists of the tweet id, the user id and screen name of the retweeting user,
# the user id and screen name of the retweeted user, and the tweet's creation time.

# In[3]:


retweet_df.head(5)


# ### Create lookup of retweeted user ids to screen names

# In[4]:


# From the retweets, extract map of retweeted user ids to screen names.
# idxmax picks, per retweeted user, the row with the latest tweet_created_at,
# so the lookup holds the most recently seen screen name for each user id.
# (.ix was removed from pandas; a single .loc selects rows and columns.)
retweet_user_id_lookup_df = (
    retweet_df
    .loc[
        retweet_df.groupby('retweet_user_id')['tweet_created_at'].idxmax(),
        ['retweet_user_id', 'retweet_screen_name'],
    ]
    .set_index(['retweet_user_id'])
)
retweet_user_id_lookup_df.count()


# In[5]:


retweet_user_id_lookup_df.head()


# ### Create lookup of user ids to screen names

# In[6]:


# From the users (not the retweets), extract map of user ids to screen names.
# idxmax picks, per retweeting user, the row with the latest tweet_created_at,
# so the lookup holds the most recently seen screen name for each user id.
# (.ix was removed from pandas; a single .loc selects rows and columns.)
user_id_lookup_df = (
    retweet_df
    .loc[
        retweet_df.groupby('user_id')['tweet_created_at'].idxmax(),
        ['user_id', 'screen_name'],
    ]
    .set_index(['user_id'])
)
user_id_lookup_df.count()


# ### Group retweets by retweeted user id

# In[7]:


# One row per retweeted user id, holding the number of retweets of that user.
# The row count should match the retweeted user id lookup count.
retweet_summary_user_id_df = (
    retweet_df
    .groupby('retweet_user_id')
    .size()
    .to_frame('retweet_count')
)
retweet_summary_user_id_df.count()


# In[8]:


retweet_summary_user_id_df.head(5)


# ### Add back in the retweet screen names

# In[9]:


# Attach the retweeted user's screen name to each count (index-on-index left join;
# both frames are indexed by retweet_user_id).
retweet_summary_screen_name_df = retweet_summary_user_id_df.join(
    retweet_user_id_lookup_df,
    how='left',
)
retweet_summary_screen_name_df.count()


# In[10]:


retweet_summary_screen_name_df.head(5)

# ### Add user types for retweets
# In[11]:


# Load the lookup of known users, keeping only the 'type' column.
# NOTE(review): presumably indexed by user id, since it is joined on the index later - confirm in utils.
from utils import load_user_type_lookup_df

user_type_lookup_df = load_user_type_lookup_df().loc[:, ['type']]
user_type_lookup_df.count()


# In[12]:


user_type_lookup_df.head(5)


# In[13]:


# Number of known users per type
user_type_lookup_df.type.value_counts()


# In[14]:


# Join the retweets and the known users (left join keeps every retweeted user).
retweet_summary_type_df = retweet_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Retweeted users without a match in the known-user lookup get type 'unknown'.
# Assign the result back: calling fillna(inplace=True) on df['type'] is a chained
# operation that does not reliably update the frame (and never does under Copy-on-Write).
retweet_summary_type_df['type'] = retweet_summary_type_df['type'].fillna('unknown')
retweet_summary_type_df.index.name = 'user_id'
retweet_summary_type_df.count()


# In[15]:


retweet_summary_type_df.head()


# ### Add number of users retweeting
# Which is different than the number of retweets.

# In[16]:


# Unique (retweeted user, retweeting user) pairs, so each retweeting user
# counts once per retweeted account no matter how many retweets they made.
retweet_user_id_per_user_df = retweet_df.loc[:, ['retweet_user_id', 'user_id']].drop_duplicates()
retweet_user_id_per_user_summary_df = (
    retweet_user_id_per_user_df
    .groupby('retweet_user_id')
    .size()
    .to_frame('users_retweeting_count')
)
retweet_user_id_per_user_summary_df.index.name = 'user_id'
retweet_user_id_per_user_summary_df.head(5)
# Join with retweet_summary_type_df
retweet_summary_df = retweet_summary_type_df.join(
    retweet_user_id_per_user_summary_df,
    how='left',
)
# Fraction of all retweeting users (one row each in user_id_lookup_df) that retweeted the account.
retweet_summary_df['percent_of_users_retweeting'] = retweet_summary_df['users_retweeting_count'].div(
    user_id_lookup_df['screen_name'].count()
)
retweet_summary_df.head(5)


# ## Retweet summary

# ### Retweets per user
# For users that made any retweets. It is also possible to figure this out for all users.

# In[17]:


retweet_df.user_id.value_counts().describe()


# ### How long is the tail?

# In[18]:


# For each distinct users_retweeting_count value, total the retweets ('sum') and
# count the retweeted accounts ('size'). String aggregation names are used instead
# of np.sum / np.size, which newer pandas warns about; the resulting column labels
# ('retweet_count', 'sum') and ('retweet_count', 'size') are unchanged.
retweet_grouped_by_users_retweeting_df = (
    retweet_summary_df[['retweet_count', 'users_retweeting_count']]
    .groupby(by='users_retweeting_count')
    .agg(['sum', 'size'])
)
# Running totals (and their share of the grand total) as users_retweeting_count increases.
retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum'] = (
    retweet_grouped_by_users_retweeting_df['retweet_count', 'sum'].cumsum()
)
retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum_percentage'] = (
    retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum']
    / retweet_grouped_by_users_retweeting_df['retweet_count', 'sum'].sum()
)
retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users'] = (
    retweet_grouped_by_users_retweeting_df['retweet_count', 'size'].cumsum()
)
retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users_percentage'] = (
    retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users']
    / retweet_grouped_by_users_retweeting_df['retweet_count', 'size'].sum()
)
retweet_grouped_by_users_retweeting_df


# In[19]:


# get_ipython() is only defined inside IPython/Jupyter. Skip the inline-plot
# magic when this exported notebook is run with a plain Python interpreter
# instead of failing with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# Plot both cumulative shares against users_retweeting_count (the frame's index).
retweet_grouped_by_users_retweeting_df[['cumulative_retweet_count_sum_percentage', 'cumulative_retweeted_users_percentage']].plot()


# ## Cut off the tail.
# Removes users that were retweeted by 5 or fewer users.

# In[20]:


# Drop, in place, every retweeted account with 5 or fewer retweeting users.
retweet_summary_df.drop(
    index=retweet_summary_df.loc[retweet_summary_df['users_retweeting_count'] <= 5].index,
    inplace=True,
)
# Number of retweeted accounts that remain
retweet_summary_df['retweet_screen_name'].count()


# ## Approach 1: By retweet count

# ### Top accounts (by retweet count)
# Unknown for type indicates that it is not matched with a known Twitter account.

# In[21]:


# The 50 most retweeted accounts
retweet_summary_df.sort_values(by='retweet_count', ascending=False).head(50)


# ### Account types (by retweet count)

# In[22]:


# Total retweets per account type, plus each type's share of all retweets.
types_by_retweet_count_df = retweet_summary_df.groupby('type')[['retweet_count']].sum()
types_by_retweet_count_df['type_percentage'] = types_by_retweet_count_df['retweet_count'].div(
    types_by_retweet_count_df['retweet_count'].sum()
)
types_by_retweet_count_df.sort_values(by='retweet_count', ascending=False)


# ## Approach 2: Per user
# Retweets by type per user.

# ### Add type by merging screen name lookup

# In[23]:


# Attach the known-user type of the retweeted account to every individual retweet.
retweet_all_join_df = pd.merge(
    retweet_df,
    user_type_lookup_df[['type']],
    how='left',
    left_on='retweet_user_id',
    right_index=True,
)
# Retweets of accounts missing from the lookup get type 'unknown'.
# Assign the result back: calling fillna(inplace=True) on df['type'] is a chained
# operation that does not reliably update the frame (and never does under Copy-on-Write).
retweet_all_join_df['type'] = retweet_all_join_df['type'].fillna('unknown')
# Drop tail: keep only retweets of accounts still present in retweet_summary_df.
retweet_all_join_limited_df = retweet_all_join_df[retweet_all_join_df.retweet_user_id.isin(retweet_summary_df.index)]
retweet_all_join_limited_df.head()


# In[24]:


# One row per retweeting user, one column per account type, holding the number
# of retweets that user made of accounts of that type (0 where there were none).
retweet_summary_by_user_df = (
    retweet_all_join_limited_df
    .groupby(['user_id', 'type'])
    .size()
    .unstack()
    .fillna(0)
)
# Add a total column
retweet_summary_by_user_df['total'] = retweet_summary_by_user_df.sum(axis=1)
# Every column except the trailing 'total' is a per-type count; add its share of the total.
for type_name in retweet_summary_by_user_df.columns[:-1]:
    percent_column = '{}_percent'.format(type_name)
    retweet_summary_by_user_df[percent_column] = retweet_summary_by_user_df[type_name].div(
        retweet_summary_by_user_df['total']
    )
retweet_summary_by_user_df.head(10)


# ### Average of percent of retweets by type for each user
# That is, for each user determine the percent of retweets by type. Then take the average of each type.
# 
# Thus, this retweet analysis is on a per-user basis, accounting for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)
# 

# In[25]:


retweet_summary_by_user_df.filter(regex="_percent$", axis=1).mean()


# ## Approach 3: By count of users retweeting
# The number of users that retweeted an account. Thus, each user counts as 1, even if that user made multiple retweets of the account.
# 
# This weights an account that is retweeted by 100 users more heavily than an account that is retweeted 100 times by a single user.

# In[26]:


# The 20 accounts retweeted by the most distinct users
retweet_summary_df.sort_values(by='users_retweeting_count', ascending=False).head(20)


# ### Account types (by count of users retweeting)

# In[27]:


# Total retweeting-user counts per account type, plus each type's share of the overall total.
types_by_users_retweeting_df = retweet_summary_df.groupby('type')[['users_retweeting_count']].sum()
types_by_users_retweeting_df['type_percentage'] = types_by_users_retweeting_df['users_retweeting_count'].div(
    types_by_users_retweeting_df['users_retweeting_count'].sum()
)
types_by_users_retweeting_df.sort_values(by='users_retweeting_count', ascending=False)


# ## Unknown accounts
# Remember, the tail has been cut off

# ### Number of unknown accounts

# In[28]:


# Per-column non-null counts for accounts whose type is 'unknown'
retweet_summary_df.loc[retweet_summary_df['type'] == 'unknown'].count()


# ### Number of known accounts

# In[29]:


# Per-column non-null counts for accounts with any other type
retweet_summary_df.loc[retweet_summary_df['type'] != 'unknown'].count()


# ### Top unknown by retweet count that are retweeted by at least 5 users

# In[32]:


# Unknown accounts retweeted by at least 5 users, most retweeted first.
# NOTE(review): accounts with 5 or fewer retweeting users were already dropped
# from retweet_summary_df above, so the >= 5 condition does not remove anything further.
top_not_known_retweet_df = (
    retweet_summary_df
    .loc[
        (retweet_summary_df['type'] == 'unknown')
        & (retweet_summary_df['users_retweeting_count'] >= 5),
        ['retweet_screen_name', 'retweet_count', 'users_retweeting_count'],
    ]
    .sort_values(by='retweet_count', ascending=False)
)
top_not_known_retweet_df.head(50)


# ### Write top accounts to file

# In[33]:


# Written to the current working directory, with the user_id index as the first column.
top_not_known_retweet_df.to_csv('unknown_retweets.csv')


# In[ ]: