#!/usr/bin/env python
# coding: utf-8
#

# Table of Contents

#
#

# Table of Contents

#
# # Gender dynamics
# ## Tweet data prep
# ### Load the tweets

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Set float format so pandas doesn't display scientific notation
pd.options.display.float_format = '{:20,.2f}'.format

# Tweet types that are summed into the per-user tweet totals below.
TWEET_TYPES = ['original', 'quote', 'reply', 'retweet']


def tweet_transform(tweet):
    """Reduce a raw tweet dict to the fields used by the tweet analysis.

    Reads ``id_str``, ``created_at`` and the nested ``user`` dict of *tweet*;
    the ``tweet_type`` value is whatever ``utils.tweet_type`` returns for it.
    """
    return {
        'tweet_id': tweet['id_str'],
        'tweet_created_at': date_parse(tweet['created_at']),
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'tweet_type': tweet_type(tweet)
    }


tweet_df = load_tweet_df(tweet_transform,
                         ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type'],
                         dedupe_columns=['tweet_id'])
tweet_df.count()


# In[2]:


tweet_df.head()


# ## Tweet analysis
# ### What are the first and last tweets in the dataset?

# In[3]:


tweet_df.tweet_created_at.min()


# In[4]:


tweet_df.tweet_created_at.max()


# ### How many retweets, original tweets, replies, and quotes are in dataset?

# In[5]:


pd.DataFrame({'count': tweet_df.tweet_type.value_counts(),
              'percentage': tweet_df.tweet_type.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})


# ## Tweeter data prep
# This comes from the following sources:
# 1. User lookup: These are lists of users exported from SFM. These are the final set of beltway journalists. Accounts that were suspended or deleted have been removed from this list. Also, this list will include users that did not tweet (i.e., have no tweets in dataset).
# 2. Tweets in the dataset: Used to generate tweet counts per tweeter. However, since some beltway journalists may not have tweeted, this may be a subset of the user lookup. Also, it may include the tweets of some users that were later excluded because their accounts were suspended or deleted or determined to not be beltway journalists.
# 3. User info lookup: Information on users that was manually coded in the beltway journalist spreadsheet or looked up from Twitter's API. This includes some accounts that were excluded from data collection for various reasons such as working for a foreign news organization or no longer working as a beltway journalist. Thus, these are a superset of the user lookup.
#
# Thus, the tweeter data should include tweet and user info data only from users in the user lookup.
# ### Load user lookup

# In[6]:


user_lookup_filepaths = ('lookups/senate_press_lookup.csv',
                         'lookups/periodical_press_lookup.csv',
                         'lookups/radio_and_television_lookup.csv')
user_lookup_df = pd.concat((pd.read_csv(user_lookup_filepath,
                                        usecols=['Uid', 'Token'],
                                        dtype={'Uid': str})
                            for user_lookup_filepath in user_lookup_filepaths))
user_lookup_df.set_index('Uid', inplace=True)
user_lookup_df.rename(columns={'Token': 'screen_name'}, inplace=True)
user_lookup_df.index.names = ['user_id']
# Some users may be in multiple lists, so need to drop duplicates
user_lookup_df = user_lookup_df[~user_lookup_df.index.duplicated()]

user_lookup_df.count()


# In[7]:


user_lookup_df.head()


# ### Tweets in dataset per tweeter

# In[8]:


user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()
# unstack() only creates columns for tweet types that actually occur, so add
# any expected type that is missing before summing them up.
for expected_type in TWEET_TYPES:
    if expected_type not in user_tweet_count_df.columns:
        user_tweet_count_df[expected_type] = 0.0
user_tweet_count_df.fillna(0, inplace=True)
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[TWEET_TYPES].sum(axis=1)
user_tweet_count_df.count()


# In[9]:


user_tweet_count_df.head()


# ### Load user info

# In[10]:


user_info_df = pd.read_csv('source_data/user_info_lookup.csv',
                           names=['user_id', 'name', 'organization', 'position', 'gender',
                                  'followers_count', 'following_count', 'tweet_count',
                                  'user_created_at', 'verified', 'protected'],
                           dtype={'user_id': str}).set_index(['user_id'])
user_info_df.count()


# In[11]:


user_info_df.head()


# In[12]:


user_summary_df = user_lookup_df.join([user_info_df, user_tweet_count_df], how='left')
# Fill NaNs. The result is assigned back rather than calling
# ``user_summary_df[col].fillna(..., inplace=True)``: that chained in-place form
# operates on a temporary Series and is not guaranteed to update the frame.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()


# In[13]:


user_summary_df.head()


# ### Remove users with no tweets in dataset

# In[14]:


user_summary_df[user_summary_df.tweets_in_dataset == 0].count()


# In[15]:


user_summary_df = user_summary_df[user_summary_df.tweets_in_dataset != 0]
user_summary_df.count()


# ## Tweeter analysis
# ### How many of the journalists are male / female?

# In[16]:


journalist_gender_summary_df = pd.DataFrame({
    'count': user_summary_df.gender.value_counts(),
    'percentage': user_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})
journalist_gender_summary_df


# ### Summary
#
# * 25%, 50%, 75% are the percentiles. (Min is equivalent to 0%. Max is equivalent to 100%. 50% is the median.)
# * std is standard deviation, normalized by N-1.
# #### All

# In[17]:


user_summary_df[['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()


# #### Female

# In[18]:


user_summary_df[user_summary_df.gender == 'F'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()


# #### Male

# In[19]:


user_summary_df[user_summary_df.gender == 'M'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()


# ### Verified
# #### Of all journalists, how many are verified?

# In[20]:


pd.DataFrame({'count': user_summary_df.verified.value_counts(),
              'percentage': user_summary_df.verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})


# #### Of female journalists, how many are verified?
# In[21]:


female_verified = user_summary_df.loc[user_summary_df.gender == 'F', 'verified']
pd.DataFrame({
    'count': female_verified.value_counts(),
    'percentage': female_verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
})


# #### Of male journalists, how many are verified?

# In[22]:


male_verified = user_summary_df.loc[user_summary_df.gender == 'M', 'verified']
pd.DataFrame({
    'count': male_verified.value_counts(),
    'percentage': male_verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
})


# ## Mention data prep
# ### Load mentions from tweets
# Including original tweets only

# In[23]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Keep floats readable: fixed notation instead of scientific notation
pd.options.display.float_format = '{:20,.2f}'.format


# Simplify the tweet on load
def mention_transform(tweet):
    """Return one record per user mention in an original tweet.

    Tweets whose ``tweet_type`` is not ``'original'`` yield an empty list, as
    do original tweets without ``entities`` / ``user_mentions``.
    """
    if tweet_type(tweet) != 'original':
        return []
    user_mentions = tweet.get('entities', {}).get('user_mentions', [])
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': mentioned['id_str'],
            'mention_screen_name': mentioned['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mentioned in user_mentions
    ]


mention_columns = ['tweet_id', 'user_id', 'screen_name', 'mention_user_id',
                   'mention_screen_name', 'tweet_created_at']
base_mention_df = load_tweet_df(mention_transform,
                                mention_columns,
                                dedupe_columns=['tweet_id', 'mention_user_id'])
base_mention_df.count()


# In[24]:


base_mention_df.head()


# ### Add gender of mentioner

# In[25]:


mentioner_gender = user_summary_df['gender']
mention_df = base_mention_df.join(mentioner_gender, on='user_id')
mention_df.count()


# ### How many tweets have mentions?
# In[26]:


mention_df['tweet_id'].unique().size


# ### How many users are mentioned? (All users, not just journalists)

# In[27]:


mention_df['mention_user_id'].unique().size


# ### Limit to mentions of journalists

# In[28]:


# The inner join keeps only mentions whose target is in user_summary_df; the
# joined gender column collides with the mentioner's, hence the suffix + rename.
journalists_mention_df = mention_df.join(user_summary_df['gender'], how='inner', on='mention_user_id', rsuffix='_mention')
journalists_mention_df.rename(columns={'gender_mention': 'mention_gender'}, inplace=True)
journalists_mention_df.count()


# In[29]:


journalists_mention_df.head()


# ### Functions for summarizing mentions by beltway journalists

# In[30]:


# Gender of beltway journalists mentioned by beltway journalists
def journalist_mention_gender_summary(mention_df):
    """Summarize mentions by the gender of the mentioned journalist.

    Args:
        mention_df: mentions (one row per mention) with a ``mention_gender``
            column.

    Returns:
        DataFrame indexed by gender with columns ``count``, ``percentage``
        (string ending in ``%``) and ``avg_mentions`` (``count`` divided by the
        number of journalists of that gender in the module-level
        ``journalist_gender_summary_df``).
    """
    mention_counts = mention_df.mention_gender.value_counts()
    mention_shares = mention_df.mention_gender.value_counts(normalize=True)
    gender_summary_df = pd.DataFrame({
        'count': mention_counts,
        'percentage': mention_shares.mul(100).round(1).astype(str) + '%',
    })
    # Divide by index alignment instead of reset_index() + row['index']: the
    # name of the column produced by reset_index() depends on how the pandas
    # version names the value_counts() index, so looking it up as 'index' is
    # not reliable.
    journalists_per_gender = journalist_gender_summary_df['count'].reindex(gender_summary_df.index)
    gender_summary_df['avg_mentions'] = gender_summary_df['count'] / journalists_per_gender
    gender_summary_df.index.name = 'index'
    return gender_summary_df


def journalist_mention_summary(mention_df):
    """Build a per-journalist summary of how often each one is mentioned.

    Args:
        mention_df: mentions with ``mention_user_id`` and ``user_id`` columns.

    Returns:
        ``user_summary_df`` (module level) joined with ``mention_count`` (total
        mentions) and ``mentioning_count`` (number of distinct users
        mentioning), missing values filled with 0, sorted descending by
        ``mention_count``, ``mentioning_count``, ``followers_count``.
    """
    # Mention count
    mention_count_df = mention_df.mention_user_id.value_counts().rename('mention_count').to_frame()

    # Mentioning users. That is, the number of unique users mentioning each user.
    mention_user_id_per_user_df = mention_df[['mention_user_id', 'user_id']].drop_duplicates()
    mentioning_user_count_df = (mention_user_id_per_user_df
                                .groupby('mention_user_id')
                                .size()
                                .to_frame('mentioning_count'))
    mentioning_user_count_df.index.name = 'user_id'

    # Join with user summary
    journalist_mention_summary_df = user_summary_df.join([mention_count_df, mentioning_user_count_df])
    # NOTE(review): this fills NaN in every column, not only the two count
    # columns, so missing user info (e.g. gender) also becomes 0 — confirm
    # that is intended.
    journalist_mention_summary_df.fillna(0, inplace=True)
    journalist_mention_summary_df = journalist_mention_summary_df.sort_values(
        ['mention_count', 'mentioning_count', 'followers_count'], ascending=False)
    return journalist_mention_summary_df


# Gender of top journalists mentioned by beltway journalists
def top_journalist_mention_gender_summary(mention_summary_df, mentioning_count_threshold=0, head=100):
    """Gender breakdown of the first *head* rows of a mention summary.

    Args:
        mention_summary_df: frame with ``mentioning_count`` and ``gender``
            columns; rows are taken in their existing order.
        mentioning_count_threshold: only rows with ``mentioning_count``
            strictly greater than this are considered.
        head: number of leading rows to keep after filtering.

    Returns:
        DataFrame indexed by gender with ``count`` and ``percentage`` columns.
    """
    above_threshold = mention_summary_df.mentioning_count > mentioning_count_threshold
    top_mention_summary_df = mention_summary_df[above_threshold].head(head)
    return pd.DataFrame({
        'count': top_mention_summary_df.gender.value_counts(),
        'percentage': top_mention_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})


# Fields for displaying journalist mention summaries
journalist_mention_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'mention_count', 'mentioning_count']


# ## Mentioned analysis
# *Note that for each of these, the complete list is being written to CSV in the output directory.*
#
# ### Original tweets (since mentions are extracted from original tweets)
# #### Of the original tweets, how many were posted by male journalists / female journalists?
# In[31]:


# Total original tweets per gender
original_tweets_by_gender_df = user_summary_df[['gender', 'original']].groupby('gender').sum()

# Share of all original tweets, rendered as a percentage string
total_original_tweets = user_summary_df.original.sum()
original_share = original_tweets_by_gender_df.original.div(total_original_tweets).mul(100).round(1)
original_tweets_by_gender_df['percentage'] = original_share.astype(str) + '%'

# Original tweets per journalist of that gender
journalists_per_gender = journalist_gender_summary_df.loc[original_tweets_by_gender_df.index, 'count']
original_tweets_by_gender_df['avg_original'] = original_tweets_by_gender_df['original'] / journalists_per_gender
original_tweets_by_gender_df


# #### Who posted the most original tweets?

# In[32]:


original_tweet_fields = ['screen_name', 'name', 'organization', 'gender',
                         'followers_count', 'tweet_count', 'original', 'tweets_in_dataset']
user_summary_df[original_tweet_fields].sort_values(['original'], ascending=False).head(25)


# #### Mentions of all accounts (not just journalists)
# #### Of journalists mentioning accounts, which are mentioned the most?
# This is based on screen name, which could have changed during collection period. However, for the users that would be at the top of this list, seems unlikely.
# In[33]:


# Mention count
mention_count_screen_name_df = (mention_df.mention_screen_name
                                .value_counts()
                                .rename('mention_count')
                                .to_frame())

# Count of mentioning users
mention_user_id_per_user_screen_name_df = mention_df[['mention_screen_name', 'user_id']].drop_duplicates()
mentioning_count_screen_name_df = (mention_user_id_per_user_screen_name_df
                                   .groupby('mention_screen_name')
                                   .size()
                                   .to_frame('mentioning_count'))
mentioning_count_screen_name_df.index.name = 'screen_name'

all_mentioned_df = mention_count_screen_name_df.join(mentioning_count_screen_name_df)
all_mentioned_df.to_csv('output/all_mentioned_by_journalists.csv')
all_mentioned_df.head(25)


# #### Same, but ordered by the number of journalists mentioning the account

# In[34]:


all_mentioned_df.sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)


# ### Journalists mentioning journalists
# #### Of journalists mentioning journalists, who is mentioned the most?

# In[35]:


journalists_mention_summary_df = journalist_mention_summary(journalists_mention_df)
journalists_mention_summary_df.to_csv('output/journalists_mentioned_by_journalists.csv')
journalists_mention_summary_df[journalist_mention_summary_fields].head(25)


# #### Same, but ordered by number of journalists mentioning

# In[36]:


(journalists_mention_summary_df[journalist_mention_summary_fields]
 .sort_values(['mentioning_count', 'mention_count'], ascending=False)
 .head(25))


# #### Of journalists mentioning other journalists, how many are male / female?

# In[37]:


journalist_mention_gender_summary(journalists_mention_df)


# #### On average how many times are journalists mentioned by other journalists?

# In[38]:


journalists_mention_summary_df[['mention_count']].describe()


# ### Journalists mentioning female journalists
# #### Of journalists mentioning female journalists who is mentioned the most?
# In[39]:


mentioned_is_female = journalists_mention_summary_df.gender == 'F'
female_journalists_mention_summary_df = journalists_mention_summary_df.loc[mentioned_is_female]
female_journalists_mention_summary_df.to_csv('output/female_journalists_mentioned_by_journalists.csv')
female_journalists_mention_summary_df[journalist_mention_summary_fields].head(25)


# #### On average, how many times are female journalists mentioned by journalists?

# In[40]:


female_journalists_mention_summary_df[['mention_count']].describe()


# ### Journalists mentioning male journalists
# #### Of journalists mentioning male journalists, who do they mention the most?

# In[41]:


mentioned_is_male = journalists_mention_summary_df.gender == 'M'
male_journalists_mention_summary_df = journalists_mention_summary_df.loc[mentioned_is_male]
male_journalists_mention_summary_df.to_csv('output/male_journalists_mentioned_by_journalists.csv')
male_journalists_mention_summary_df[journalist_mention_summary_fields].head(25)


# #### On average, how many times are male journalists mentioned by journalists?

# In[42]:


male_journalists_mention_summary_df[['mention_count']].describe()


# ### Female journalists mentioning other journalists
# #### Of female journalists mentioning other journalists, who do they mention the most?

# In[43]:


# Mentions whose author (the mentioner) is female
mentions_by_female_df = journalists_mention_df.loc[journalists_mention_df.gender == 'F']
journalists_mentioned_by_female_summary_df = journalist_mention_summary(mentions_by_female_df)
journalists_mentioned_by_female_summary_df.to_csv('output/journalists_mentioned_by_female_journalists.csv')
journalists_mentioned_by_female_summary_df[journalist_mention_summary_fields].head(25)


# #### Of female journalists mentioning journalists, how many are male / female?

# In[44]:


journalist_mention_gender_summary(journalists_mention_df.loc[journalists_mention_df.gender == 'F'])


# ### Male journalists mentioning other journalists
# #### Of male journalists mentioning other journalists, who do they mention the most?
# In[45]:


journalists_mentioned_by_male_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])
journalists_mentioned_by_male_summary_df.to_csv('output/journalists_mentioned_by_male_journalists.csv')
journalists_mentioned_by_male_summary_df[journalist_mention_summary_fields].head(25)


# #### Of male journalists mentioning other journalists, how many are male / female?

# In[46]:


journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])


# ## Retweet data prep
# ### Load retweets from tweets
# Including retweets and quotes

# In[47]:


# Simplify the tweet on load
def retweet_transform(tweet):
    """Return a record describing whom a retweet / quote tweet is retweeting.

    Args:
        tweet: raw tweet dict.

    Returns:
        A dict with the retweeting tweet's id, author and creation time plus
        the id and screen name of the retweeted / quoted user, or ``None`` when
        the tweet is not a retweet or quote, or carries neither a
        ``retweeted_status`` nor a ``quoted_status``.
    """
    if tweet_type(tweet) not in ('retweet', 'quote'):
        return None
    retweet = tweet.get('retweeted_status') or tweet.get('quoted_status')
    if not retweet:
        # Without an embedded status there is no retweeted user to record;
        # skip the tweet instead of failing on ``None['user']``.
        logging.getLogger().warning('Skipping tweet %s: no retweeted_status or quoted_status',
                                    tweet.get('id_str'))
        return None
    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'retweet_user_id': retweet['user']['id_str'],
        'retweet_screen_name': retweet['user']['screen_name'],
        'tweet_created_at': date_parse(tweet['created_at'])
    }


base_retweet_df = load_tweet_df(retweet_transform,
                                ['tweet_id', 'user_id', 'screen_name', 'retweet_user_id',
                                 'retweet_screen_name', 'tweet_created_at'],
                                dedupe_columns=['tweet_id'])
base_retweet_df.count()


# In[48]:


base_retweet_df.head()


# ### Add gender of retweeter

# In[49]:


retweet_df = base_retweet_df.join(user_summary_df['gender'], on='user_id')
retweet_df.count()


# ### How many users have been retweeted by journalists?
# In[50]:


retweet_df['retweet_user_id'].unique().size


# ### Limit to retweeted journalists

# In[51]:


# The inner join keeps only retweets whose target is in user_summary_df; the
# joined gender column collides with the retweeter's, hence the suffix + rename.
journalists_retweet_df = retweet_df.join(user_summary_df['gender'], how='inner', on='retweet_user_id', rsuffix='_retweet')
journalists_retweet_df.rename(columns={'gender_retweet': 'retweet_gender'}, inplace=True)
journalists_retweet_df.count()


# In[52]:


journalists_retweet_df.head()


# ### Functions for summarizing retweets by beltway journalists

# In[53]:


# Gender of beltway journalists retweeted by beltway journalists
def journalist_retweet_gender_summary(retweet_df):
    """Summarize retweets by the gender of the retweeted journalist.

    Args:
        retweet_df: retweets (one row per retweet) with a ``retweet_gender``
            column.

    Returns:
        DataFrame indexed by gender with columns ``count``, ``percentage``
        (string ending in ``%``) and ``avg_retweets`` (``count`` divided by the
        number of journalists of that gender in the module-level
        ``journalist_gender_summary_df``).
    """
    retweet_counts = retweet_df.retweet_gender.value_counts()
    retweet_shares = retweet_df.retweet_gender.value_counts(normalize=True)
    gender_summary_df = pd.DataFrame({
        'count': retweet_counts,
        'percentage': retweet_shares.mul(100).round(1).astype(str) + '%',
    })
    # Divide by index alignment instead of reset_index() + row['index']: the
    # name of the column produced by reset_index() depends on how the pandas
    # version names the value_counts() index, so looking it up as 'index' is
    # not reliable.
    journalists_per_gender = journalist_gender_summary_df['count'].reindex(gender_summary_df.index)
    gender_summary_df['avg_retweets'] = gender_summary_df['count'] / journalists_per_gender
    gender_summary_df.index.name = 'index'
    return gender_summary_df


def journalist_retweet_summary(retweet_df):
    """Build a per-journalist summary of how often each one is retweeted.

    Args:
        retweet_df: retweets with ``retweet_user_id`` and ``user_id`` columns.

    Returns:
        ``user_summary_df`` (module level) joined with ``retweet_count`` (total
        retweets) and ``retweeting_count`` (number of distinct users
        retweeting), missing values filled with 0, sorted descending by
        ``retweet_count``, ``retweeting_count``, ``followers_count``.
    """
    # Retweet count
    retweet_count_df = retweet_df.retweet_user_id.value_counts().rename('retweet_count').to_frame()

    # Retweeting users. That is, the number of unique users retweeting each user.
    retweet_user_id_per_user_df = retweet_df[['retweet_user_id', 'user_id']].drop_duplicates()
    retweeting_user_count_df = (retweet_user_id_per_user_df
                                .groupby('retweet_user_id')
                                .size()
                                .to_frame('retweeting_count'))
    retweeting_user_count_df.index.name = 'user_id'

    # Join with user summary
    journalist_retweet_summary_df = user_summary_df.join([retweet_count_df, retweeting_user_count_df])
    # NOTE(review): this fills NaN in every column, not only the two count
    # columns, so missing user info (e.g. gender) also becomes 0 — confirm
    # that is intended.
    journalist_retweet_summary_df.fillna(0, inplace=True)
    journalist_retweet_summary_df = journalist_retweet_summary_df.sort_values(
        ['retweet_count', 'retweeting_count', 'followers_count'], ascending=False)
    return journalist_retweet_summary_df


# Gender of top journalists retweeted by beltway journalists
def top_journalist_retweet_gender_summary(retweet_summary_df, retweeting_count_threshold=0, head=100):
    """Gender breakdown of the first *head* rows of a retweet summary.

    Args:
        retweet_summary_df: frame with ``retweeting_count`` and ``gender``
            columns; rows are taken in their existing order.
        retweeting_count_threshold: only rows with ``retweeting_count``
            strictly greater than this are considered.
        head: number of leading rows to keep after filtering.

    Returns:
        DataFrame indexed by gender with ``count`` and ``percentage`` columns.
    """
    above_threshold = retweet_summary_df.retweeting_count > retweeting_count_threshold
    top_retweet_summary_df = retweet_summary_df[above_threshold].head(head)
    return pd.DataFrame({
        'count': top_retweet_summary_df.gender.value_counts(),
        'percentage': top_retweet_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})


# Fields for displaying journalist retweet summaries
journalist_retweet_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'retweet_count', 'retweeting_count']


# ## Retweet analysis
# *Note that for each of these, the complete list is being written to CSV in the output directory.*
#
# ### Retweets of all accounts (not just journalists)
# #### Of journalists retweeting other accounts, how many of the retweets are from males / females?
# That is, by gender of retweeter.
# In[54]:


# Retweets and quotes per gender of the retweeter
retweets_by_gender_df = user_summary_df[['gender', 'retweet', 'quote']].groupby('gender').sum()
retweets_by_gender_df['total'] = retweets_by_gender_df['retweet'] + retweets_by_gender_df['quote']

# Share of all retweets + quotes, rendered as a percentage string
retweet_share = retweets_by_gender_df.total.div(retweets_by_gender_df.total.sum()).mul(100).round(1)
retweets_by_gender_df['percentage'] = retweet_share.astype(str) + '%'

# Retweets + quotes per journalist of that gender
journalists_per_gender = journalist_gender_summary_df.loc[retweets_by_gender_df.index, 'count']
retweets_by_gender_df['avg_retweets'] = retweets_by_gender_df['total'] / journalists_per_gender
retweets_by_gender_df


# #### Of journalists retweeting other accounts, who retweets the most?

# In[55]:


retweeter_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count',
                    'tweet_count', 'retweet', 'quote', 'tweets_in_dataset']
retweet_user_summary_df = user_summary_df.loc[:, retweeter_fields]
retweet_user_summary_df['retweet_count'] = retweet_user_summary_df['retweet'] + retweet_user_summary_df['quote']
retweet_user_summary_df.sort_values(['retweet_count'], ascending=False).head(25)


# #### Of journalists retweeting other accounts, who is retweeted the most?
# This is based on screen name, which could have changed during collection period. However, for the users that would be at the top of this list, seems unlikely.
# In[56]:


# Retweet count
retweet_count_screen_name_df = (retweet_df.retweet_screen_name
                                .value_counts()
                                .rename('retweet_count')
                                .to_frame())

# Count of retweeting users
retweet_user_id_per_user_screen_name_df = retweet_df[['retweet_screen_name', 'user_id']].drop_duplicates()
retweeting_count_screen_name_df = (retweet_user_id_per_user_screen_name_df
                                   .groupby('retweet_screen_name')
                                   .size()
                                   .to_frame('retweeting_count'))
retweeting_count_screen_name_df.index.name = 'screen_name'

all_retweeted_df = retweet_count_screen_name_df.join(retweeting_count_screen_name_df)
all_retweeted_df.to_csv('output/all_retweeted_by_journalists.csv')
all_retweeted_df.head(25)


# ### Journalists retweeting other journalists
# #### Of journalists retweeting other journalists, who is retweeted the most?

# In[57]:


journalists_retweet_summary_df = journalist_retweet_summary(journalists_retweet_df)
journalists_retweet_summary_df.to_csv('output/journalists_retweeted_by_journalists.csv')
journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)


# #### Of journalists retweeting other journalists, how many of the retweets are of males / females?

# In[58]:


journalist_retweet_gender_summary(journalists_retweet_df)


# #### On average, how many times are journalists retweeted by other journalists?

# In[59]:


journalists_retweet_summary_df[['retweet_count']].describe()


# ### Journalists retweeting female journalists
# #### Of journalists retweeting female journalists, who is retweeted the most?

# In[60]:


retweeted_is_female = journalists_retweet_summary_df.gender == 'F'
female_journalists_retweet_summary_df = journalists_retweet_summary_df.loc[retweeted_is_female]
female_journalists_retweet_summary_df.to_csv('output/female_journalists_retweeted_by_journalists.csv')
female_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)


# #### On average, how many times are female journalists retweeted by other journalists?
# In[61]:


female_journalists_retweet_summary_df[['retweet_count']].describe()


# ### Journalists retweeting male journalists
# #### Of journalists retweeting male journalists, who is retweeted the most?

# In[62]:


retweeted_is_male = journalists_retweet_summary_df.gender == 'M'
male_journalists_retweet_summary_df = journalists_retweet_summary_df.loc[retweeted_is_male]
male_journalists_retweet_summary_df.to_csv('output/male_journalists_retweeted_by_journalists.csv')
male_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)


# #### On average, how many times are male journalists retweeted by other journalists?

# In[63]:


male_journalists_retweet_summary_df[['retweet_count']].describe()


# ### Female journalists retweeting other journalists
# #### Of female journalists retweeting other journalists, who is retweeted the most?

# In[64]:


# Retweets whose author (the retweeter) is female
retweets_by_female_df = journalists_retweet_df.loc[journalists_retweet_df.gender == 'F']
journalists_retweeted_by_female_summary_df = journalist_retweet_summary(retweets_by_female_df)
journalists_retweeted_by_female_summary_df.to_csv('output/journalists_retweeted_by_female_journalists.csv')
journalists_retweeted_by_female_summary_df[journalist_retweet_summary_fields].head(25)


# #### Of female journalists retweeting other journalists, how many are male / female?
# Average is of female journalists retweeting other journalists, how many retweets does each male / female journalist receive.

# In[65]:


journalist_retweet_gender_summary(journalists_retweet_df.loc[journalists_retweet_df.gender == 'F'])


# #### On average, how many times do female journalists retweet male / female / all journalists?
# That is, retweets per female journalist.
# In[66]:


female_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'F']
# Retweets per female retweeter, one column per gender of the retweeted journalist
female_retweets_per_user_df = (female_journalists_retweet_df
                               .groupby(['user_id', 'retweet_gender'])
                               .size()
                               .unstack())
# reindex (rather than [['F', 'M']]) so that a gender nobody retweeted still
# gets a column instead of raising KeyError; it is zero-filled below.
female_journalists_retweet_by_gender_df = pd.merge(
    user_summary_df[user_summary_df.gender == 'F'],
    female_retweets_per_user_df,
    how='left', left_index=True, right_index=True,
).reindex(columns=['F', 'M'])
female_journalists_retweet_by_gender_df.fillna(0, inplace=True)
female_journalists_retweet_by_gender_df['all'] = (female_journalists_retweet_by_gender_df.F
                                                  + female_journalists_retweet_by_gender_df.M)
female_journalists_retweet_by_gender_df.describe()


# ### Male journalists retweeting other journalists
# #### Of male journalists retweeting other journalists, who is retweeted the most?

# In[67]:


journalists_retweeted_by_male_summary_df = journalist_retweet_summary(journalists_retweet_df[journalists_retweet_df.gender == 'M'])
journalists_retweeted_by_male_summary_df.to_csv('output/journalists_retweeted_by_male_journalists.csv')
journalists_retweeted_by_male_summary_df[journalist_retweet_summary_fields].head(25)


# #### Of male journalists retweeting other journalists, how many are male / female?
# Average is of male journalists retweeting other journalists, how many retweets does each male / female journalist receive.

# In[68]:


journalist_retweet_gender_summary(journalists_retweet_df[journalists_retweet_df.gender == 'M'])


# #### On average, how many times do male journalists retweet male / female / all journalists?
# That is, retweets per male journalist.
# In[69]:


male_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'M']
# Retweets per male retweeter, one column per gender of the retweeted journalist
male_retweets_per_user_df = (male_journalists_retweet_df
                             .groupby(['user_id', 'retweet_gender'])
                             .size()
                             .unstack())
# reindex (rather than [['F', 'M']]) so that a gender nobody retweeted still
# gets a column instead of raising KeyError; it is zero-filled below.
male_journalists_retweet_by_gender_df = pd.merge(
    user_summary_df[user_summary_df.gender == 'M'],
    male_retweets_per_user_df,
    how='left', left_index=True, right_index=True,
).reindex(columns=['F', 'M'])
male_journalists_retweet_by_gender_df.fillna(0, inplace=True)
male_journalists_retweet_by_gender_df['all'] = (male_journalists_retweet_by_gender_df.F
                                                + male_journalists_retweet_by_gender_df.M)
male_journalists_retweet_by_gender_df.describe()


# In[ ]: