#!/usr/bin/env python
# coding: utf-8

# Table of Contents

#
# # Gender dynamics
# ## Tweet data prep
# ### Load the tweets

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.parser import parse as date_parse

from utils import load_tweet_df, tweet_type

# Emit DEBUG-and-above records through the root logger.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Set float format so doesn't display scientific notation
pd.options.display.float_format = '{:20,.2f}'.format


def tweet_transform(tweet):
    """Reduce a raw tweet dict to the fields needed for the tweet table.

    Returns a dict with the tweet id, parsed creation time, the author's
    user id and screen name, and the tweet type reported by ``tweet_type``.
    """
    return {
        'tweet_id': tweet['id_str'],
        'tweet_created_at': date_parse(tweet['created_at']),
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'tweet_type': tweet_type(tweet),
    }


tweet_df = load_tweet_df(
    tweet_transform,
    ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type'],
    dedupe_columns=['tweet_id'],
)
tweet_df.count()

# In[2]:

tweet_df.head()

# ## Tweeter data prep
# ### Prepare the tweeter data
# This comes from the following sources:
# 1. User lookup: These are lists of users exported from SFM. These are the final set of beltway journalists. Accounts that were suspended or deleted have been removed from this list. Also, this list will include users that did not tweet (i.e., have no tweets in dataset).
# 2. Tweets in the dataset: Used to generate tweet counts per tweeter. However, since some beltway journalists may not have tweeted, this may be a subset of the user lookup. Also, it may include the tweets of some users that were later excluded because their accounts were suspended or deleted or determined to not be beltway journalists.
# 3. User info lookup: Information on users that was manually coded in the beltway journalist spreadsheet or looked up from Twitter's API. This includes some accounts that were excluded from data collection for various reasons such as working for a foreign news organization or no longer working as a beltway journalist. Thus, these are a superset of the user lookup.
#
# Thus, the tweeter data should include tweet and user info data only from users in the user lookup.
# ### Load user lookup

# In[3]:

user_lookup_filepaths = ('lookups/senate_press_lookup.csv',
                         'lookups/periodical_press_lookup.csv',
                         'lookups/radio_and_television_lookup.csv')
# Read only the id and screen-name columns of each lookup and stack them.
user_lookup_df = pd.concat(
    pd.read_csv(user_lookup_filepath, usecols=['Uid', 'Token'], dtype={'Uid': str})
    for user_lookup_filepath in user_lookup_filepaths
)
user_lookup_df.set_index('Uid', inplace=True)
user_lookup_df.rename(columns={'Token': 'screen_name'}, inplace=True)
user_lookup_df.index.names = ['user_id']
# Some users may be in multiple lists, so need to drop duplicates
user_lookup_df = user_lookup_df[~user_lookup_df.index.duplicated()]
user_lookup_df.count()

# In[4]:

user_lookup_df.head()

# ### Load user info

# In[5]:

user_info_df = pd.read_csv(
    'source_data/user_info_lookup.csv',
    names=['user_id', 'name', 'organization', 'position', 'gender',
           'followers_count', 'following_count', 'tweet_count',
           'user_created_at', 'verified', 'protected'],
    dtype={'user_id': str},
).set_index(['user_id'])
user_info_df.count()

# In[6]:

user_info_df.head()

# In[7]:

# One row per user, one column per tweet type, holding the number of tweets.
user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()
user_tweet_count_df.fillna(0, inplace=True)
user_tweet_count_df['tweets_in_dataset'] = (
    user_tweet_count_df.original
    + user_tweet_count_df.quote
    + user_tweet_count_df.reply
    + user_tweet_count_df.retweet
)

# In[8]:

# Left join keeps exactly the users in the lookup, whether or not they have
# user info or tweets.
user_summary_df = user_lookup_df.join([user_info_df, user_tweet_count_df], how='left')
# Fill NaNs. A single DataFrame-level fillna is used instead of
# ``user_summary_df[col].fillna(..., inplace=True)``: the per-column in-place
# form operates on a temporary under pandas Copy-on-Write and would leave
# user_summary_df unchanged.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()

# In[9]:

user_summary_df.head()

# ### Remove users with no tweets in dataset

# In[10]:
user_summary_df.loc[user_summary_df['tweets_in_dataset'] == 0].count()

# In[11]:

user_summary_df = user_summary_df.loc[user_summary_df['tweets_in_dataset'] != 0]
user_summary_df.count()

# ### Gender

# In[12]:

# Number and share of journalists (with tweets in the dataset) per gender.
journalist_gender_summary_df = pd.DataFrame({
    'count': user_summary_df['gender'].value_counts(),
    'percentage': user_summary_df['gender'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
})
journalist_gender_summary_df

# ## Reply data prep
# ### Load replies from tweets

# In[13]:


# Simplify the tweet on load
def reply_transform(tweet):
    """Return the reply-related fields of a tweet, or None if it is not a reply."""
    if tweet_type(tweet) != 'reply':
        return None
    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'reply_to_user_id': tweet['in_reply_to_user_id_str'],
        'reply_to_screen_name': tweet['in_reply_to_screen_name'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }


base_reply_df = load_tweet_df(
    reply_transform,
    ['tweet_id', 'user_id', 'screen_name', 'reply_to_user_id',
     'reply_to_screen_name', 'tweet_created_at'],
    dedupe_columns=['tweet_id'],
)
base_reply_df.count()

# In[14]:

base_reply_df.head()

# ### Add gender of replier

# In[15]:

reply_df = base_reply_df.join(user_summary_df['gender'], on='user_id')
reply_df.count()

# ### How many users have been replied to by journalists?
# In[16]:

reply_df['reply_to_user_id'].unique().size

# ### Limit to beltway journalists

# In[17]:

# Inner join on the replied-to user keeps only replies to journalists present
# in user_summary_df and attaches that journalist's gender.
journalists_reply_df = reply_df.join(user_summary_df['gender'], how='inner', on='reply_to_user_id', rsuffix='_reply')
journalists_reply_df.rename(columns={'gender_reply': 'reply_to_gender'}, inplace=True)
journalists_reply_df.count()

# In[18]:

journalists_reply_df.head()

# ### Functions for summarizing replies by beltway journalists

# In[19]:


# Gender of beltway journalists replied to by beltway journalists
def journalist_reply_gender_summary(reply_df):
    """Summarize replies by the gender of the journalist replied to.

    Args:
        reply_df: DataFrame of replies with a ``reply_to_gender`` column.

    Returns:
        DataFrame indexed by gender with columns ``count`` (number of
        replies), ``percentage`` (share of replies, formatted as a string)
        and ``avg_replies`` (``count`` divided by the number of journalists
        of that gender in the module-level ``journalist_gender_summary_df``).
    """
    reply_to_gender = reply_df['reply_to_gender']
    gender_summary_df = pd.DataFrame({
        'count': reply_to_gender.value_counts(),
        'percentage': reply_to_gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
    })
    # Divide by index label rather than via reset_index() + row['index']:
    # since pandas 2.0 value_counts() names the index after the column, so
    # reset_index() no longer creates an 'index' column and the row-wise
    # lookup raised KeyError.
    journalists_per_gender = journalist_gender_summary_df['count'].reindex(gender_summary_df.index)
    gender_summary_df['avg_replies'] = gender_summary_df['count'] / journalists_per_gender
    # Keep the index name the original reset_index/set_index round trip produced.
    gender_summary_df.index.name = 'index'
    return gender_summary_df


# Reply to beltway journalists by beltway journalists
def journalist_reply_summary(reply_df):
    """Build a per-journalist summary of replies received.

    Args:
        reply_df: DataFrame of replies with ``reply_to_user_id`` and
            ``user_id`` columns.

    Returns:
        The module-level ``user_summary_df`` joined with ``reply_to_count``
        (replies received) and ``replying_count`` (distinct users replying),
        NaNs replaced by 0, sorted descending by ``reply_to_count``,
        ``replying_count`` and ``followers_count``.
    """
    # Reply to count
    reply_count_df = reply_df['reply_to_user_id'].value_counts().rename('reply_to_count').to_frame()

    # Replying to users. That is, the number of unique users replying to each user.
    reply_to_user_id_per_user_df = reply_df[['reply_to_user_id', 'user_id']].drop_duplicates()
    replying_to_user_count_df = (
        reply_to_user_id_per_user_df.groupby('reply_to_user_id').size().rename('replying_count').to_frame()
    )
    replying_to_user_count_df.index.name = 'user_id'

    # Join with user summary
    journalist_reply_summary_df = user_summary_df.join([reply_count_df, replying_to_user_count_df])
    # NOTE: this fills NaN in every column (not only the two counts) with 0.
    journalist_reply_summary_df.fillna(0, inplace=True)
    journalist_reply_summary_df = journalist_reply_summary_df.sort_values(
        ['reply_to_count', 'replying_count', 'followers_count'], ascending=False)
    return journalist_reply_summary_df


# Gender of top journalists replied to by beltway journalists
def top_journalist_reply_gender_summary(reply_summary_df, replying_count_threshold=0, head=100):
    """Summarize the gender of the first ``head`` rows of a reply summary.

    Args:
        reply_summary_df: DataFrame with ``replying_count`` and ``gender``
            columns, already ordered as desired.
        replying_count_threshold: Only rows whose ``replying_count`` is
            strictly greater than this value are considered.
        head: Number of leading rows to keep after filtering.

    Returns:
        DataFrame indexed by gender with ``count`` and ``percentage`` columns.
    """
    top_reply_summary_df = reply_summary_df[reply_summary_df.replying_count > replying_count_threshold].head(head)
    return pd.DataFrame({
        'count': top_reply_summary_df.gender.value_counts(),
        'percentage': top_reply_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
    })


# Fields for displaying journalist reply summaries
journalist_reply_summary_fields = ['screen_name', 'name', 'organization', 'gender',
                                   'followers_count', 'reply_to_count', 'replying_count']

# ## Reply analysis
# *Note that for each of these, the complete list is being written to CSV in the output directory.*
#
# ### Of replies by journalists, how many are by males / females?
# In[20]:

# Total replies written by journalists of each gender.
replies_by_gender_df = user_summary_df[['gender', 'reply']].groupby('gender').sum()
replies_by_gender_df['percentage'] = (
    replies_by_gender_df['reply'].div(replies_by_gender_df['reply'].sum()).mul(100).round(1).astype(str) + '%'
)
# Replies per journalist of that gender, matched on the gender index label.
replies_by_gender_df['avg_replies'] = replies_by_gender_df['reply'].div(
    journalist_gender_summary_df['count'].reindex(replies_by_gender_df.index)
)
replies_by_gender_df

# ### Which journalists reply the most?

# In[21]:

user_summary_df[
    ['screen_name', 'name', 'organization', 'gender', 'followers_count',
     'tweet_count', 'reply', 'tweets_in_dataset']
].sort_values(['reply'], ascending=False).head(25)

# ### Replies to all accounts (not just journalists)
# This is based on screen name, which could have changed during collection period. However, for the users that would be at the top of this list, seems unlikely.
# #### Of journalists replying to other accounts, who do they reply to the most?

# In[22]:

# Reply to count
reply_to_count_screen_name_df = reply_df['reply_to_screen_name'].value_counts().rename('reply_to_count').to_frame()

# Count of replying users
reply_to_user_id_per_user_screen_name_df = reply_df[['reply_to_screen_name', 'user_id']].drop_duplicates()
replying_count_screen_name_df = (
    reply_to_user_id_per_user_screen_name_df.groupby('reply_to_screen_name').size()
    .rename('replying_count').to_frame()
)
replying_count_screen_name_df.index.name = 'screen_name'

all_replied_to_df = reply_to_count_screen_name_df.join(replying_count_screen_name_df)
all_replied_to_df.to_csv('output/all_replied_to_by_journalists.csv')
all_replied_to_df.head(25)

# ### Journalists replying to other journalists
# #### Of journalists replying to other journalists, who do they reply to the most?
# In[23]:

journalists_reply_summary_df = journalist_reply_summary(journalists_reply_df)
journalists_reply_summary_df.to_csv('output/journalists_replied_to_by_journalists.csv')
journalists_reply_summary_df[journalist_reply_summary_fields].head(25)

# #### Of journalists replying to other journalists, how many that they reply to are male / female?

# In[24]:

journalist_reply_gender_summary(journalists_reply_df)

# #### On average, how many times do journalists reply to each journalist?

# In[25]:

journalists_reply_summary_df[['reply_to_count']].describe()

# ### Journalists replying to female journalists
# #### Of journalists replying to female journalists, which female journalists are replied to the most?

# In[26]:

female_journalists_reply_summary_df = journalists_reply_summary_df.loc[
    journalists_reply_summary_df['gender'] == 'F'
]
female_journalists_reply_summary_df.to_csv('output/female_journalists_replied_to_by_journalists.csv')
female_journalists_reply_summary_df[journalist_reply_summary_fields].head(25)

# #### On average, how many times do journalists reply to each female journalist?

# In[27]:

female_journalists_reply_summary_df[['reply_to_count']].describe()

# ### Journalists replying to male journalists
# #### Of journalists replying to male journalists, which male journalists are replied to the most?

# In[28]:

male_journalists_reply_summary_df = journalists_reply_summary_df.loc[
    journalists_reply_summary_df['gender'] == 'M'
]
male_journalists_reply_summary_df.to_csv('output/male_journalists_replied_to_by_journalists.csv')
male_journalists_reply_summary_df[journalist_reply_summary_fields].head(25)

# #### On average, how often do journalists reply to each male journalist?

# In[29]:

male_journalists_reply_summary_df[['reply_to_count']].describe()

# ### Female journalists replying to journalists
# #### Of female journalists replying to journalists, who do they reply to the most?
# In[30]:

# Summary restricted to replies written by female journalists.
journalists_replied_to_by_female_summary_df = journalist_reply_summary(
    journalists_reply_df.loc[journalists_reply_df['gender'] == 'F']
)
journalists_replied_to_by_female_summary_df.to_csv('output/journalists_replied_to_by_female_journalists.csv')
journalists_replied_to_by_female_summary_df[journalist_reply_summary_fields].head(25)

# #### Of female journalists replying to journalists, how many males / females do they reply to?

# In[31]:

journalist_reply_gender_summary(journalists_reply_df.loc[journalists_reply_df['gender'] == 'F'])

# ### Male journalists replying to journalists
# #### Of male journalists replying to journalists, who do they reply to the most?

# In[32]:

# Summary restricted to replies written by male journalists.
journalists_replied_to_by_male_summary_df = journalist_reply_summary(
    journalists_reply_df.loc[journalists_reply_df['gender'] == 'M']
)
journalists_replied_to_by_male_summary_df.to_csv('output/journalists_replied_to_by_male_journalists.csv')
journalists_replied_to_by_male_summary_df[journalist_reply_summary_fields].head(25)

# #### Of male journalists replying to journalists, how many are male / female?
# In[33]:

journalist_reply_gender_summary(journalists_reply_df[journalists_reply_df.gender == 'M'])

# ## Following data prep
# ### Load following
# Users that are followed by beltway journalists

# In[34]:

# Ids are read as ``str``; the ``np.str`` alias was removed in NumPy 1.24
# and was only ever an alias of the builtin.
base_follower_to_followed_df = pd.read_csv(
    'source_data/follower_to_followed.csv',
    names=['follower_user_id', 'followed_user_id'],
    dtype={'follower_user_id': str, 'followed_user_id': str},
)
base_follower_to_followed_df.drop_duplicates(inplace=True)
base_follower_to_followed_df.count()

# In[35]:

base_follower_to_followed_df.head()

# In[36]:

user_info_df.head()

# In[37]:

# This will drop followers of journalists that have no tweets
follower_to_followed_df = base_follower_to_followed_df.join(
    user_summary_df['gender'], on='follower_user_id', how='inner')
follower_to_followed_df.count()

# In[38]:

follower_to_followed_df.head()

# ### Load followed users

# In[39]:

followed_screen_name_lookup_df = pd.read_csv(
    'source_data/followed.csv',
    names=['screen_name', 'user_id'],
    dtype={'user_id': str},
).set_index(['user_id'])
followed_screen_name_lookup_df.head()

# ### Limit to beltway journalists

# In[40]:

# Inner join on the followed user keeps only follows of journalists present
# in user_summary_df and attaches that journalist's gender.
follower_to_journalist_followed_df = follower_to_followed_df.join(
    user_summary_df['gender'], how='inner', on='followed_user_id', rsuffix='_followed')
follower_to_journalist_followed_df.rename(columns={'gender_followed': 'followed_gender'}, inplace=True)
follower_to_journalist_followed_df.count()

# In[41]:

follower_to_journalist_followed_df.head()

# ### Functions for summarizing following by beltway journalists

# In[42]:


# Gender of beltway journalists followed by beltway journalists
def journalist_followed_gender_summary(follower_to_followed_df):
    """Summarize follow relationships by the gender of the followed journalist.

    Args:
        follower_to_followed_df: DataFrame of follow relationships with a
            ``followed_gender`` column.

    Returns:
        DataFrame indexed by gender with columns ``count`` (number of
        follows), ``percentage`` (share of follows, formatted as a string)
        and ``avg_followed`` (``count`` divided by the number of journalists
        of that gender in the module-level ``journalist_gender_summary_df``).
    """
    followed_gender = follower_to_followed_df['followed_gender']
    gender_summary_df = pd.DataFrame({
        'count': followed_gender.value_counts(),
        'percentage': followed_gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
    })
    # Divide by index label rather than via reset_index() + row['index']:
    # since pandas 2.0 value_counts() names the index after the column, so
    # reset_index() no longer creates an 'index' column and the row-wise
    # lookup raised KeyError.
    journalists_per_gender = journalist_gender_summary_df['count'].reindex(gender_summary_df.index)
    gender_summary_df['avg_followed'] = gender_summary_df['count'] / journalists_per_gender
    # Keep the index name the original reset_index/set_index round trip produced.
    gender_summary_df.index.name = 'index'
    return gender_summary_df


def journalist_following_summary(follower_to_followed_df):
    """Build a per-journalist summary of how many journalists follow them.

    Args:
        follower_to_followed_df: DataFrame of follow relationships with a
            ``followed_user_id`` column.

    Returns:
        The module-level ``user_summary_df`` joined with
        ``journalist_follower_count``, NaNs replaced by 0, sorted descending
        by ``journalist_follower_count`` and ``followers_count``.
    """
    # Following count
    following_count_df = (
        follower_to_followed_df['followed_user_id'].value_counts().rename('journalist_follower_count').to_frame()
    )

    # Join with user summary
    journalist_following_summary_df = user_summary_df.join(following_count_df)
    # NOTE: this fills NaN in every column (not only the count) with 0.
    journalist_following_summary_df.fillna(0, inplace=True)
    journalist_following_summary_df = journalist_following_summary_df.sort_values(
        ['journalist_follower_count', 'followers_count'], ascending=False)
    return journalist_following_summary_df


# Gender of top journalists followed by beltway journalists
def top_journalist_followed_gender_summary(followed_summary_df, head=100):
    """Summarize the gender of the first ``head`` rows of a following summary.

    Args:
        followed_summary_df: DataFrame with a ``gender`` column, already
            ordered as desired.
        head: Number of leading rows to include.

    Returns:
        DataFrame indexed by gender with ``count`` and ``percentage`` columns.
    """
    top_followed_summary_df = followed_summary_df.head(head)
    return pd.DataFrame({
        'count': top_followed_summary_df.gender.value_counts(),
        'percentage': top_followed_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
    })


# Fields for displaying journalist following summaries
journalist_following_summary_fields = ['screen_name', 'name', 'organization', 'gender',
                                       'followers_count', 'journalist_follower_count']

# ## Following analysis
# ### Journalists following all accounts
# #### Of journalists following all accounts, who do they follow the most?

# In[43]:

# Following count
all_followed_df = pd.DataFrame(
    follower_to_followed_df.followed_user_id.value_counts().rename('following_count')
).join(followed_screen_name_lookup_df)
all_followed_df.to_csv('output/all_followed_by_journalists.csv')
all_followed_df.head(25)

# ### Journalists following journalists
# #### Of all journalists followed by journalists, who is followed the most?
# In[44]:

follower_to_journalist_followed_summary_df = journalist_following_summary(follower_to_journalist_followed_df)
follower_to_journalist_followed_summary_df.to_csv('output/journalists_followed_by_journalists.csv')
follower_to_journalist_followed_summary_df[journalist_following_summary_fields].head(25)

# #### Of journalists following journalists, how many of the followed journalists are male / female?

# In[45]:

journalist_followed_gender_summary(follower_to_journalist_followed_df)

# #### On average, how many journalists follow each journalist?

# In[46]:

follower_to_journalist_followed_summary_df[['journalist_follower_count']].describe()

# ### Journalists following female journalists
# #### Of journalists following female journalists, which female journalists do they follow the most?

# In[47]:

follower_to_female_journalist_followed_df = follower_to_journalist_followed_summary_df.loc[
    follower_to_journalist_followed_summary_df['gender'] == 'F'
]
follower_to_female_journalist_followed_df.to_csv('output/female_journalists_followed_by_journalists.csv')
follower_to_female_journalist_followed_df[journalist_following_summary_fields].head(25)

# #### On average, how many journalists follow each female journalist?

# In[48]:

follower_to_female_journalist_followed_df[['journalist_follower_count']].describe()

# ### Journalists following male journalists

# In[49]:

follower_to_male_journalist_followed_df = follower_to_journalist_followed_summary_df.loc[
    follower_to_journalist_followed_summary_df['gender'] == 'M'
]
follower_to_male_journalist_followed_df.to_csv('output/male_journalists_followed_by_journalists.csv')
follower_to_male_journalist_followed_df[journalist_following_summary_fields].head(25)

# #### On average, how many journalists follow each male journalist?

# In[50]:

follower_to_male_journalist_followed_df[['journalist_follower_count']].describe()

# ### Female journalists following journalists
# #### Of female journalists following journalists, who do they follow the most?
# In[51]:

# Summary restricted to follows made by female journalists.
female_follower_to_journalist_followed_df = journalist_following_summary(
    follower_to_journalist_followed_df.loc[follower_to_journalist_followed_df['gender'] == 'F']
)
female_follower_to_journalist_followed_df.to_csv('output/journalists_followed_by_female_journalists.csv')
female_follower_to_journalist_followed_df[journalist_following_summary_fields].head(25)

# #### Of female journalists following journalists, how many of the followed journalists are male / female?

# In[52]:

journalist_followed_gender_summary(
    follower_to_journalist_followed_df.loc[follower_to_journalist_followed_df['gender'] == 'F']
)

# ### Male journalists following journalists
# ### Of male journalists following journalists, who do they follow the most?

# In[53]:

# Summary restricted to follows made by male journalists.
male_follower_to_journalist_followed_df = journalist_following_summary(
    follower_to_journalist_followed_df.loc[follower_to_journalist_followed_df['gender'] == 'M']
)
male_follower_to_journalist_followed_df.to_csv('output/journalists_followed_by_male_journalists.csv')
male_follower_to_journalist_followed_df[journalist_following_summary_fields].head(25)

# #### Of male journalists following journalists, how many of the followed journalists are male / female?

# In[54]:

journalist_followed_gender_summary(
    follower_to_journalist_followed_df.loc[follower_to_journalist_followed_df['gender'] == 'M']
)

# In[ ]: