%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt
# Use the root logger at DEBUG level so debug-level messages are emitted
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Show floats as fixed-point with thousands separators instead of scientific notation
pd.set_option('display.float_format', '{:20,.2f}'.format)
def tweet_transform(tweet):
    """Reduce a raw tweet dict to the fields used for the per-tweet table.

    Args:
        tweet: tweet dict providing ``id_str``, ``created_at`` and a ``user``
            dict with ``id_str`` and ``screen_name``.

    Returns:
        dict with keys ``tweet_id``, ``tweet_created_at`` (parsed with
        ``date_parse``), ``user_id``, ``screen_name`` and ``tweet_type``
        (whatever ``tweet_type(tweet)`` returns).
    """
    summary = {'tweet_id': tweet['id_str']}
    summary['tweet_created_at'] = date_parse(tweet['created_at'])
    author = tweet['user']
    summary['user_id'] = author['id_str']
    summary['screen_name'] = author['screen_name']
    summary['tweet_type'] = tweet_type(tweet)
    return summary
# Columns of the loaded frame, in display order
tweet_columns = ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type']
# NOTE(review): dedupe_columns presumably drops repeated tweet ids inside load_tweet_df - confirm in utils
tweet_df = load_tweet_df(tweet_transform, tweet_columns, dedupe_columns=['tweet_id'])
tweet_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 817136 user_id 817136 screen_name 817136 tweet_created_at 817136 tweet_type 817136 dtype: int64
tweet_df.head()
tweet_id | user_id | screen_name | tweet_created_at | tweet_type | |
---|---|---|---|---|---|
0 | 872631046088601600 | 327862439 | jonathanvswan | 2017-06-08 01:47:08+00:00 | retweet |
1 | 872610483647516673 | 327862439 | jonathanvswan | 2017-06-08 00:25:26+00:00 | retweet |
2 | 872609618626826240 | 327862439 | jonathanvswan | 2017-06-08 00:22:00+00:00 | retweet |
3 | 872605974699311104 | 327862439 | jonathanvswan | 2017-06-08 00:07:31+00:00 | retweet |
4 | 872603191518646276 | 327862439 | jonathanvswan | 2017-06-07 23:56:27+00:00 | retweet |
This comes from the following sources:
Thus, the tweet data should include tweets and user info only for users in the user lookup.
user_lookup_filepaths = ('lookups/senate_press_lookup.csv',
                         'lookups/periodical_press_lookup.csv',
                         'lookups/radio_and_television_lookup.csv')
# Read Uid as str so the ids keep their exact text form
lookup_frames = [pd.read_csv(path, usecols=['Uid', 'Token'], dtype={'Uid': str})
                 for path in user_lookup_filepaths]
user_lookup_df = (pd.concat(lookup_frames)
                  .set_index('Uid')
                  .rename(columns={'Token': 'screen_name'}))
user_lookup_df.index.names = ['user_id']
# A user may appear in more than one list; keep only the first row for each id
user_lookup_df = user_lookup_df[~user_lookup_df.index.duplicated()]
user_lookup_df.count()
screen_name 2487 dtype: int64
user_lookup_df.head()
screen_name | |
---|---|
user_id | |
23455653 | abettel |
33919343 | AshleyRParker |
18580432 | b_fung |
399225358 | b_muzz |
18834692 | becca_milfeld |
# The CSV is read with explicit column names (names=...); user_id stays a string and becomes the index
user_info_columns = ['user_id', 'name', 'organization', 'position',
                     'gender', 'followers_count', 'following_count', 'tweet_count',
                     'user_created_at', 'verified', 'protected']
user_info_df = pd.read_csv('source_data/user_info_lookup.csv',
                           names=user_info_columns,
                           dtype={'user_id': str})
user_info_df = user_info_df.set_index(['user_id'])
user_info_df.count()
name 2506 organization 2477 position 2503 gender 2505 followers_count 2506 following_count 2506 tweet_count 2506 user_created_at 2506 verified 2506 protected 2506 dtype: int64
user_info_df.head()
name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
20711445 | Glinski, Nina | NaN | Freelance Reporter | F | 963 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False |
258917371 | Enders, David | NaN | Journalist | M | 1444 | 484 | 6296 | Mon Feb 28 19:52:03 +0000 2011 | True | False |
297046834 | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 759 | 352 | 631 | Wed May 11 20:55:24 +0000 2011 | True | False |
455585786 | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2944 | 2691 | 6277 | Thu Jan 05 08:26:46 +0000 2012 | True | False |
42584840 | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2703 | 201 | 6366 | Tue May 26 07:41:38 +0000 2009 | False | False |
# Tweets per user and tweet type: one row per user_id, one column per tweet_type.
# fillna(0) (rather than unstack(fill_value=0)) keeps the float dtype the rest of the notebook displays.
user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()
user_tweet_count_df = user_tweet_count_df.fillna(0)
user_tweet_count_df['tweets_in_dataset'] = (user_tweet_count_df.original + user_tweet_count_df.quote
                                            + user_tweet_count_df.reply + user_tweet_count_df.retweet)
# Left join: every user in the lookup is kept, with or without user info or tweets
user_summary_df = user_lookup_df.join([user_info_df, user_tweet_count_df], how='left')
# Fill NaNs. A single DataFrame.fillna with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` calls, which operate on a temporary Series and
# do not update the frame under pandas copy-on-write.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()
screen_name 2487 name 2487 organization 2487 position 2484 gender 2486 followers_count 2487 following_count 2487 tweet_count 2487 user_created_at 2487 verified 2487 protected 2487 original 2487 quote 2487 reply 2487 retweet 2487 tweets_in_dataset 2487 dtype: int64
user_summary_df.head()
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||||||||
23455653 | abettel | Bettelheim, Adriel | Politico | Health Care Editor | F | 2664 | 1055 | 15990 | Mon Mar 09 16:32:20 +0000 2009 | True | False | 289.00 | 12.00 | 6.00 | 52.00 | 359.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | White House Reporter | F | 122382 | 2342 | 12433 | Tue Apr 21 14:28:57 +0000 2009 | True | False | 172.00 | 67.00 | 11.00 | 120.00 | 370.00 |
18580432 | b_fung | Fung, Brian | Washington Post | Tech Reporter | M | 16558 | 2062 | 44799 | Sat Jan 03 15:15:57 +0000 2009 | True | False | 257.00 | 85.00 | 205.00 | 82.00 | 629.00 |
399225358 | b_muzz | Murray, Brendan | Bloomberg News | Managing Editor, U.S. Economy | M | 624 | 382 | 360 | Thu Oct 27 05:34:05 +0000 2011 | True | False | 3.00 | 0.00 | 0.00 | 5.00 | 8.00 |
18834692 | becca_milfeld | Milfeld, Becca | Agence France-Presse | English Desk Editor and Journalist | F | 483 | 993 | 1484 | Sat Jan 10 13:58:43 +0000 2009 | False | False | 3.00 | 14.00 | 0.00 | 7.00 | 24.00 |
user_summary_df[user_summary_df.tweets_in_dataset == 0].count()
screen_name 195 name 195 organization 195 position 195 gender 194 followers_count 195 following_count 195 tweet_count 195 user_created_at 195 verified 195 protected 195 original 195 quote 195 reply 195 retweet 195 tweets_in_dataset 195 dtype: int64
# Keep only the journalists that have at least one tweet in the dataset
has_tweets = user_summary_df['tweets_in_dataset'] != 0
user_summary_df = user_summary_df[has_tweets]
user_summary_df.count()
screen_name 2292 name 2292 organization 2292 position 2289 gender 2292 followers_count 2292 following_count 2292 tweet_count 2292 user_created_at 2292 verified 2292 protected 2292 original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 dtype: int64
# Number and share of journalists per gender; later cells use the counts as denominators for per-journalist averages
gender_counts = user_summary_df.gender.value_counts()
gender_shares = user_summary_df.gender.value_counts(normalize=True)
journalist_gender_summary_df = pd.DataFrame({
    'count': gender_counts,
    'percentage': gender_shares.mul(100).round(1).astype(str) + '%',
})
journalist_gender_summary_df
count | percentage | |
---|---|---|
M | 1299 | 56.7% |
F | 993 | 43.3% |
# Simplify the tweet on load
def reply_transform(tweet):
    """Extract the reply fields from a tweet, skipping tweets that are not replies.

    Args:
        tweet: tweet dict providing ``id_str``, ``created_at``,
            ``in_reply_to_user_id_str``, ``in_reply_to_screen_name`` and a
            ``user`` dict with ``id_str`` and ``screen_name``.

    Returns:
        dict with keys ``tweet_id``, ``user_id``, ``screen_name``,
        ``reply_to_user_id``, ``reply_to_screen_name`` and
        ``tweet_created_at`` when ``tweet_type(tweet)`` is ``'reply'``;
        otherwise ``None``.
    """
    # Guard clause: anything that is not a reply yields None
    if tweet_type(tweet) != 'reply':
        return None
    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'reply_to_user_id': tweet['in_reply_to_user_id_str'],
        'reply_to_screen_name': tweet['in_reply_to_screen_name'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }
# Columns of the reply frame, in display order
reply_columns = ['tweet_id', 'user_id', 'screen_name', 'reply_to_user_id',
                 'reply_to_screen_name', 'tweet_created_at']
# reply_transform returns None for non-replies
# NOTE(review): load_tweet_df presumably skips None results - confirm in utils
base_reply_df = load_tweet_df(reply_transform, reply_columns, dedupe_columns=['tweet_id'])
base_reply_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 126254 user_id 126254 screen_name 126254 reply_to_user_id 126254 reply_to_screen_name 126254 tweet_created_at 126254 dtype: int64
base_reply_df.head()
tweet_id | user_id | screen_name | reply_to_user_id | reply_to_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 872495244062978048 | 327862439 | jonathanvswan | 59331128 | PhilipRucker | 2017-06-07 16:47:31+00:00 |
1 | 872473152160399361 | 327862439 | jonathanvswan | 2856617865 | RPhuket | 2017-06-07 15:19:43+00:00 |
2 | 872266930341728256 | 327862439 | jonathanvswan | 1854392378 | hrm_1973 | 2017-06-07 01:40:16+00:00 |
3 | 872250430109175809 | 327862439 | jonathanvswan | 390985197 | MikeBastasch | 2017-06-07 00:34:42+00:00 |
4 | 872218322187767808 | 327862439 | jonathanvswan | 407013776 | burgessev | 2017-06-06 22:27:07+00:00 |
# Attach the replying journalist's gender by matching user_id against the user summary index (left join)
replier_gender = user_summary_df['gender']
reply_df = base_reply_df.join(replier_gender, on='user_id')
reply_df.count()
tweet_id 126254 user_id 126254 screen_name 126254 reply_to_user_id 126254 reply_to_screen_name 126254 tweet_created_at 126254 gender 126254 dtype: int64
reply_df['reply_to_user_id'].unique().size
31034
# Inner join on the replied-to id keeps only replies addressed to a journalist in user_summary_df;
# that journalist's gender is added as reply_to_gender
reply_to_gender = user_summary_df['gender'].rename('reply_to_gender')
journalists_reply_df = reply_df.join(reply_to_gender, how='inner', on='reply_to_user_id')
journalists_reply_df.count()
tweet_id 43390 user_id 43390 screen_name 43390 reply_to_user_id 43390 reply_to_screen_name 43390 tweet_created_at 43390 gender 43390 reply_to_gender 43390 dtype: int64
journalists_reply_df.head()
tweet_id | user_id | screen_name | reply_to_user_id | reply_to_screen_name | tweet_created_at | gender | reply_to_gender | |
---|---|---|---|---|---|---|---|---|
4 | 872218322187767808 | 327862439 | jonathanvswan | 407013776 | burgessev | 2017-06-06 22:27:07+00:00 | M | M |
234 | 871795694020984833 | 195840597 | JNicholsonInDC | 407013776 | burgessev | 2017-06-05 18:27:45+00:00 | M | M |
572 | 870371176866041856 | 163589845 | PoliticoKevin | 407013776 | burgessev | 2017-06-01 20:07:13+00:00 | M | M |
728 | 870659438901940224 | 115564212 | IsaacDovere | 407013776 | burgessev | 2017-06-02 15:12:40+00:00 | M | M |
731 | 872473152143667201 | 167024520 | rachaelmbade | 407013776 | burgessev | 2017-06-07 15:19:43+00:00 | F | M |
# Gender of beltway journalists replied to by beltway journalists
def journalist_reply_gender_summary(reply_df):
    """Summarise replies by the gender of the journalist being replied to.

    Args:
        reply_df: DataFrame with one row per reply and a ``reply_to_gender``
            column.

    Returns:
        DataFrame indexed by gender (index named ``'index'``) with columns
        ``count`` (replies received), ``percentage`` (share of all replies,
        formatted like ``'76.5%'``) and ``avg_replies`` (``count`` divided by
        the number of journalists of that gender, read from the module-level
        ``journalist_gender_summary_df``).

    Raises:
        KeyError: if a gender in ``reply_df`` is missing from
            ``journalist_gender_summary_df``.
    """
    reply_counts = reply_df.reply_to_gender.value_counts()
    reply_shares = reply_df.reply_to_gender.value_counts(normalize=True)
    gender_summary_df = pd.DataFrame({
        'count': reply_counts,
        'percentage': reply_shares.mul(100).round(1).astype(str) + '%',
    })
    # Divide label-aligned Series instead of looking each row up through a
    # reset_index() column: that column is called 'index' only on pandas < 2.0
    # (newer versions name it after the counted column), so row['index'] breaks.
    journalists_per_gender = journalist_gender_summary_df.loc[gender_summary_df.index, 'count']
    gender_summary_df['avg_replies'] = gender_summary_df['count'] / journalists_per_gender
    # Keep the index name the original reset_index/set_index round trip produced
    gender_summary_df.index.name = 'index'
    return gender_summary_df
# Reply to beltway journalists by beltway journalists
def journalist_reply_summary(reply_df):
    """Rank journalists by the replies they received in ``reply_df``.

    Args:
        reply_df: DataFrame with one row per reply and the columns
            ``user_id`` (replying user) and ``reply_to_user_id`` (user being
            replied to).

    Returns:
        The module-level ``user_summary_df`` joined with two extra columns,
        ``reply_to_count`` (replies received) and ``replying_count`` (number
        of distinct users who replied); both are 0 for journalists without
        replies. Rows are sorted descending by ``reply_to_count``,
        ``replying_count`` and ``followers_count``.
    """
    # Reply to count
    reply_count_df = reply_df.reply_to_user_id.value_counts().rename('reply_to_count').to_frame()
    # Replying to users. That is, the number of unique users replying to each user.
    unique_pairs_df = reply_df[['reply_to_user_id', 'user_id']].drop_duplicates()
    replying_to_user_count_df = (unique_pairs_df.groupby('reply_to_user_id').size()
                                 .rename('replying_count').to_frame())
    replying_to_user_count_df.index.name = 'user_id'
    # Join with user summary
    journalist_reply_summary_df = user_summary_df.join([reply_count_df, replying_to_user_count_df])
    # Zero-fill only the two count columns. A frame-wide fillna(0) would also
    # write the number 0 into missing text fields such as 'position'.
    count_columns = ['reply_to_count', 'replying_count']
    journalist_reply_summary_df[count_columns] = journalist_reply_summary_df[count_columns].fillna(0)
    return journalist_reply_summary_df.sort_values(
        ['reply_to_count', 'replying_count', 'followers_count'], ascending=False)
# Gender of top journalists replied to by beltway journalists
def top_journalist_reply_gender_summary(reply_summary_df, replying_count_threshold=0, head=100):
    """Gender breakdown of the first ``head`` journalists above a replier threshold.

    Args:
        reply_summary_df: DataFrame with ``replying_count`` and ``gender``
            columns. Rows are taken in their existing order, so pass a frame
            that is already ranked.
        replying_count_threshold: only rows with ``replying_count`` strictly
            greater than this value are considered.
        head: maximum number of rows taken after filtering.

    Returns:
        DataFrame indexed by gender with columns ``count`` and ``percentage``
        (share of the selected rows, formatted like ``'75.0%'``).
    """
    above_threshold = reply_summary_df.replying_count > replying_count_threshold
    top_reply_summary_df = reply_summary_df[above_threshold].head(head)
    # Count once and derive the shares from the counts instead of counting twice
    gender_counts = top_reply_summary_df.gender.value_counts()
    gender_shares = gender_counts / gender_counts.sum()
    return pd.DataFrame({
        'count': gender_counts,
        'percentage': gender_shares.mul(100).round(1).astype(str) + '%',
    })
# Fields for displaying journalist reply summaries
journalist_reply_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'reply_to_count', 'replying_count']
Note that for each of these, the complete list is being written to CSV in the output directory.
# Replies written by journalists, totalled by the gender of the replying journalist
replies_by_gender_df = user_summary_df[['gender', 'reply']].groupby('gender').sum()
replies_by_gender_df['percentage'] = replies_by_gender_df.reply.div(replies_by_gender_df.reply.sum()).mul(100).round(1).astype(str) + '%'
# Average replies per journalist of each gender. Both frames are indexed by the
# gender value, so the division aligns on label and no row-wise apply is needed.
replies_by_gender_df['avg_replies'] = replies_by_gender_df.reply / journalist_gender_summary_df['count']
replies_by_gender_df
reply | percentage | avg_replies | |
---|---|---|---|
gender | |||
F | 31,831.00 | 25.2% | 32.06 |
M | 94,423.00 | 74.8% | 72.69 |
user_summary_df[['screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'reply', 'tweets_in_dataset']].sort_values(['reply'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | tweet_count | reply | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
user_id | ||||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 208763 | 9,033.00 | 11,432.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 205504 | 3,917.00 | 6,244.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 42497 | 2,040.00 | 3,960.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 41620 | 1,949.00 | 5,567.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 142150 | 1,714.00 | 3,983.00 |
63717541 | phillyrich1 | Weinstein, Richard | C–SPAN | M | 3827 | 27341 | 1,532.00 | 2,261.00 |
203226736 | SharylAttkisson | Attkisson, Sharyl | Sinclair Broadcast Group | F | 132973 | 24539 | 1,458.00 | 2,154.00 |
16812908 | crousselle | Rousselle, Christine | Townhall | F | 5327 | 118713 | 1,089.00 | 2,351.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 148143 | 1,040.00 | 5,078.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 67526 | 1,026.00 | 3,066.00 |
27882000 | jamiedupree | Dupree, Jamie | Cox Broadcasting | M | 140848 | 46181 | 993.00 | 2,108.00 |
3372900155 | samtayrey | Reyes, Samantha | CNN | F | 10344 | 4783 | 933.00 | 1,349.00 |
132482136 | Yaro_RT | Yaroshevsky, Alexey | RTTV America | M | 12968 | 26795 | 910.00 | 1,199.00 |
46955476 | GrahamDavidA | Graham, David A. | The Atlantic | M | 22112 | 93391 | 908.00 | 1,566.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 92203 | 901.00 | 5,187.00 |
25702314 | EricMGarcia | Garcia, Eric M. | CQ Roll Call | M | 3094 | 44783 | 863.00 | 3,584.00 |
12245632 | jackshafer | Shafer, Jack | Politico | M | 73996 | 44726 | 861.00 | 2,016.00 |
273540698 | MKTWgoldstein | Goldstein, Steven | MarketWatch | M | 10185 | 41497 | 857.00 | 1,897.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 51628 | 853.00 | 2,022.00 |
6904552 | juliemason | Mason, Julie | Sirius XM Satellite Radio | F | 31276 | 29214 | 852.00 | 1,213.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 69807 | 848.00 | 2,496.00 |
15837659 | jbendery | Bendery, Jennifer | Huffington Post | M | 41000 | 65406 | 844.00 | 2,600.00 |
15146659 | JSwiftTWS | Swift, James A. | Weekly Standard | M | 5691 | 84245 | 830.00 | 2,612.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 17796 | 807.00 | 1,312.00 |
14517538 | derekwillis | Willis, Derek | ProPublica | M | 18049 | 79502 | 781.00 | 1,811.00 |
This is based on screen name, which could have changed during the collection period. However, for the users that would be at the top of this list, that seems unlikely.
# Replies received per replied-to screen name
reply_to_count_screen_name_df = (reply_df.reply_to_screen_name.value_counts()
                                 .rename('reply_to_count').to_frame())
# Number of distinct replying users per replied-to screen name
reply_to_user_id_per_user_screen_name_df = reply_df[['reply_to_screen_name', 'user_id']].drop_duplicates()
replying_count_screen_name_df = (reply_to_user_id_per_user_screen_name_df
                                 .groupby('reply_to_screen_name').size()
                                 .rename('replying_count').to_frame())
replying_count_screen_name_df.index.name = 'screen_name'
all_replied_to_df = reply_to_count_screen_name_df.join(replying_count_screen_name_df)
all_replied_to_df.to_csv('output/all_replied_to_by_journalists.csv')
all_replied_to_df.head(25)
reply_to_count | replying_count | |
---|---|---|
ericgeller | 1980 | 75 |
chrisgeidner | 1901 | 37 |
dylanlscott | 1091 | 65 |
JaredRizzi | 750 | 46 |
StevenTDennis | 745 | 93 |
AlexParkerDC | 720 | 23 |
sahilkapur | 662 | 35 |
jseldin | 653 | 2 |
MEPFuller | 522 | 92 |
amaxsmith | 498 | 6 |
ddale8 | 495 | 20 |
CraigCaplan | 388 | 8 |
ChuckWendig | 372 | 1 |
pbump | 355 | 43 |
kelmej | 340 | 29 |
benjamin_oc | 322 | 11 |
KimberlyRobinsn | 321 | 7 |
darth | 315 | 32 |
ZoeTillman | 311 | 8 |
RichardRubinDC | 305 | 41 |
sdonnan | 304 | 7 |
AaronMehta | 304 | 35 |
MikeSacksEsq | 299 | 18 |
heathdwilliams | 298 | 1 |
ryanbeckwith | 297 | 49 |
# Journalists ranked by replies received from other journalists; the full table goes to CSV, the top 25 are shown
journalists_reply_summary_df = journalist_reply_summary(journalists_reply_df)
journalists_reply_summary_df.to_csv('output/journalists_replied_to_by_journalists.csv')
journalists_reply_summary_df.loc[:, journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 1,980.00 | 75.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 1,901.00 | 37.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 1,091.00 | 65.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 750.00 | 46.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 745.00 | 93.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 720.00 | 23.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 662.00 | 35.00 |
583821006 | jseldin | Seldin, Jeff | Voice of America | M | 5365 | 653.00 | 2.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 522.00 | 92.00 |
44951698 | amaxsmith | Smith, Max | WTOP Radio | M | 4726 | 498.00 | 6.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 495.00 | 20.00 |
317980134 | CraigCaplan | Caplan, Craig | C–SPAN | M | 6143 | 388.00 | 8.00 |
16061946 | kelmej | Mejdrich, Kellie | CQ Roll Call | F | 4146 | 340.00 | 29.00 |
15365623 | benjamin_oc | O’Connell, Benjamin | C–SPAN | M | 1455 | 322.00 | 11.00 |
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 321.00 | 7.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 311.00 | 8.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 305.00 | 41.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 304.00 | 35.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 304.00 | 7.00 |
90478926 | MikeSacksEsq | Sacks, Mike | Scripps Howard News Service | M | 9289 | 299.00 | 18.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 297.00 | 49.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 283.00 | 72.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 269.00 | 45.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 269.00 | 34.00 |
21212087 | Olivianuzzi | Nuzzi, Olivia | New York | F | 136276 | 243.00 | 25.00 |
journalist_reply_gender_summary(journalists_reply_df)
count | percentage | avg_replies | |
---|---|---|---|
index | |||
M | 33178 | 76.5% | 25.54 |
F | 10212 | 23.5% | 10.28 |
journalists_reply_summary_df[['reply_to_count']].describe()
reply_to_count | |
---|---|
count | 2,292.00 |
mean | 18.93 |
std | 81.76 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 8.00 |
max | 1,980.00 |
# Same ranking restricted to female journalists (the replied-to side)
is_female = journalists_reply_summary_df['gender'] == 'F'
female_journalists_reply_summary_df = journalists_reply_summary_df[is_female]
female_journalists_reply_summary_df.to_csv('output/female_journalists_replied_to_by_journalists.csv')
female_journalists_reply_summary_df.loc[:, journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
16061946 | kelmej | Mejdrich, Kellie | CQ Roll Call | F | 4146 | 340.00 | 29.00 |
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 321.00 | 7.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 311.00 | 8.00 |
21212087 | Olivianuzzi | Nuzzi, Olivia | New York | F | 136276 | 243.00 | 25.00 |
83462293 | SarahMMimms | Mimms, Sarah | BuzzFeed | F | 6216 | 236.00 | 24.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 233.00 | 84.00 |
3372900155 | samtayrey | Reyes, Samantha | CNN | F | 10344 | 219.00 | 18.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 212.00 | 48.00 |
1132012321 | DaniellaMicaela | Diaz, Daniella | CNN | F | 14612 | 181.00 | 36.00 |
158072303 | ValerieInsinna | Insinna, Valerie | Defense News | F | 4572 | 175.00 | 20.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 174.00 | 21.00 |
96405362 | laurenonthehill | Camera, Lauren S. | U.S. News & World Report | F | 3396 | 162.00 | 6.00 |
16812908 | crousselle | Rousselle, Christine | Townhall | F | 5327 | 149.00 | 5.00 |
47758416 | marissaaevans | Evans, Marissa | Texas Tribune | F | 6850 | 137.00 | 1.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 134.00 | 23.00 |
16434028 | gabbilevy | Levy, Gabrielle F. | U.S. News & World Report | F | 2209 | 132.00 | 4.00 |
14870670 | KateNocera | Nocera, Kate | BuzzFeed | F | 27714 | 116.00 | 36.00 |
18501487 | leighmunsil | Munsil, Leigh | CNN | F | 11059 | 107.00 | 30.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 104.00 | 12.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 99.00 | 31.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 96.00 | 44.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 95.00 | 47.00 |
38855868 | brennawilliams | Williams, Brenna | CNN | F | 7299 | 93.00 | 22.00 |
273700859 | kpolantz | Polantz, Katelyn J. | National Law Journal | F | 2483 | 91.00 | 6.00 |
3273220608 | KatherineBScott | Scott, Katherine | Bloomberg Government | F | 1841 | 85.00 | 14.00 |
female_journalists_reply_summary_df[['reply_to_count']].describe()
reply_to_count | |
---|---|
count | 993.00 |
mean | 10.28 |
std | 31.00 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 6.00 |
max | 340.00 |
# Same ranking restricted to male journalists (the replied-to side)
is_male = journalists_reply_summary_df['gender'] == 'M'
male_journalists_reply_summary_df = journalists_reply_summary_df[is_male]
male_journalists_reply_summary_df.to_csv('output/male_journalists_replied_to_by_journalists.csv')
male_journalists_reply_summary_df.loc[:, journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 1,980.00 | 75.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 1,901.00 | 37.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 1,091.00 | 65.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 750.00 | 46.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 745.00 | 93.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 720.00 | 23.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 662.00 | 35.00 |
583821006 | jseldin | Seldin, Jeff | Voice of America | M | 5365 | 653.00 | 2.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 522.00 | 92.00 |
44951698 | amaxsmith | Smith, Max | WTOP Radio | M | 4726 | 498.00 | 6.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 495.00 | 20.00 |
317980134 | CraigCaplan | Caplan, Craig | C–SPAN | M | 6143 | 388.00 | 8.00 |
15365623 | benjamin_oc | O’Connell, Benjamin | C–SPAN | M | 1455 | 322.00 | 11.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 305.00 | 41.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 304.00 | 35.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 304.00 | 7.00 |
90478926 | MikeSacksEsq | Sacks, Mike | Scripps Howard News Service | M | 9289 | 299.00 | 18.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 297.00 | 49.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 283.00 | 72.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 269.00 | 45.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 269.00 | 34.00 |
190360266 | connorobrienNH | O’Brien, Connor | Politico | M | 6158 | 241.00 | 35.00 |
63717541 | phillyrich1 | Weinstein, Richard | C–SPAN | M | 3827 | 241.00 | 4.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 238.00 | 79.00 |
80111587 | JeffYoung | Young, Jeffrey | Huffington Post | M | 26497 | 238.00 | 31.00 |
male_journalists_reply_summary_df[['reply_to_count']].describe()
reply_to_count | |
---|---|
count | 1,299.00 |
mean | 25.54 |
std | 104.71 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 11.00 |
max | 1,980.00 |
# Ranking built only from replies written by female journalists
replies_from_female_df = journalists_reply_df[journalists_reply_df['gender'] == 'F']
journalists_replied_to_by_female_summary_df = journalist_reply_summary(replies_from_female_df)
journalists_replied_to_by_female_summary_df.to_csv('output/journalists_replied_to_by_female_journalists.csv')
journalists_replied_to_by_female_summary_df.loc[:, journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 313.00 | 2.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 305.00 | 3.00 |
16061946 | kelmej | Mejdrich, Kellie | CQ Roll Call | F | 4146 | 295.00 | 15.00 |
83462293 | SarahMMimms | Mimms, Sarah | BuzzFeed | F | 6216 | 195.00 | 7.00 |
21212087 | Olivianuzzi | Nuzzi, Olivia | New York | F | 136276 | 190.00 | 9.00 |
3372900155 | samtayrey | Reyes, Samantha | CNN | F | 10344 | 179.00 | 7.00 |
96405362 | laurenonthehill | Camera, Lauren S. | U.S. News & World Report | F | 3396 | 159.00 | 5.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 148.00 | 18.00 |
1132012321 | DaniellaMicaela | Diaz, Daniella | CNN | F | 14612 | 144.00 | 22.00 |
16812908 | crousselle | Rousselle, Christine | Townhall | F | 5327 | 144.00 | 3.00 |
47758416 | marissaaevans | Evans, Marissa | Texas Tribune | F | 6850 | 137.00 | 1.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 133.00 | 5.00 |
16434028 | gabbilevy | Levy, Gabrielle F. | U.S. News & World Report | F | 2209 | 130.00 | 2.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 108.00 | 36.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 103.00 | 7.00 |
158072303 | ValerieInsinna | Insinna, Valerie | Defense News | F | 4572 | 97.00 | 8.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 97.00 | 5.00 |
18501487 | leighmunsil | Munsil, Leigh | CNN | F | 11059 | 88.00 | 13.00 |
273700859 | kpolantz | Polantz, Katelyn J. | National Law Journal | F | 2483 | 84.00 | 2.00 |
114670081 | rebleber | Leber, Rebecca J. | Mother Jones | F | 16467 | 79.00 | 3.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 78.00 | 30.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 78.00 | 20.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 73.00 | 13.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 72.00 | 10.00 |
48038024 | karentravers | Travers, Karen | ABC News | F | 17155 | 71.00 | 7.00 |
journalist_reply_gender_summary(journalists_reply_df[journalists_reply_df.gender == 'F'])
count | percentage | avg_replies | |
---|---|---|---|
index | |||
F | 7412 | 72.1% | 7.46 |
M | 2864 | 27.9% | 2.20 |
# Ranking built only from replies written by male journalists
replies_from_male_df = journalists_reply_df[journalists_reply_df['gender'] == 'M']
journalists_replied_to_by_male_summary_df = journalist_reply_summary(replies_from_male_df)
journalists_replied_to_by_male_summary_df.to_csv('output/journalists_replied_to_by_male_journalists.csv')
journalists_replied_to_by_male_summary_df.loc[:, journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 1,926.00 | 58.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 1,864.00 | 28.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 1,013.00 | 45.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 726.00 | 35.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 709.00 | 20.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 686.00 | 61.00 |
583821006 | jseldin | Seldin, Jeff | Voice of America | M | 5365 | 653.00 | 2.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 646.00 | 24.00 |
44951698 | amaxsmith | Smith, Max | WTOP Radio | M | 4726 | 495.00 | 4.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 490.00 | 16.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 456.00 | 64.00 |
317980134 | CraigCaplan | Caplan, Craig | C–SPAN | M | 6143 | 388.00 | 8.00 |
15365623 | benjamin_oc | O’Connell, Benjamin | C–SPAN | M | 1455 | 318.00 | 8.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 303.00 | 6.00 |
90478926 | MikeSacksEsq | Sacks, Mike | Scripps Howard News Service | M | 9289 | 294.00 | 13.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 284.00 | 33.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 262.00 | 29.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 249.00 | 52.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 241.00 | 30.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 240.00 | 35.00 |
63717541 | phillyrich1 | Weinstein, Richard | C–SPAN | M | 3827 | 240.00 | 3.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 232.00 | 25.00 |
26559241 | fordm | Ford, Matt S. | The Atlantic | M | 27571 | 232.00 | 15.00 |
437019753 | TimothyNoah1 | Noah, Timothy R. | Politico | M | 15090 | 231.00 | 12.00 |
23332846 | mattzap | Zapotosky, Matt | Washington Post | M | 56887 | 230.00 | 7.00 |
journalist_reply_gender_summary(journalists_reply_df[journalists_reply_df.gender == 'M'])
count | percentage | avg_replies | |
---|---|---|---|
index | |||
M | 30314 | 91.5% | 23.34 |
F | 2800 | 8.5% | 2.82 |
Users that are followed by beltway journalists
# Follower -> followed pairs. Both ids are read as strings (builtin str; the
# np.str alias was removed from NumPy) so they can be matched against the
# string user_id index used elsewhere in the notebook.
base_follower_to_followed_df = pd.read_csv('source_data/follower_to_followed.csv',
                                           names=['follower_user_id', 'followed_user_id'],
                                           dtype={'follower_user_id': str, 'followed_user_id': str})
# The same pair may be listed more than once; keep each pair once
base_follower_to_followed_df.drop_duplicates(inplace=True)
base_follower_to_followed_df.count()
follower_user_id 3417018 followed_user_id 3417018 dtype: int64
base_follower_to_followed_df.head()
follower_user_id | followed_user_id | |
---|---|---|
0 | 91156486 | 3092427779 |
1 | 91156486 | 36953109 |
2 | 91156486 | 424274008 |
3 | 91156486 | 779044378929168384 |
4 | 91156486 | 339834914 |
user_info_df.head()
name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
20711445 | Glinski, Nina | NaN | Freelance Reporter | F | 963 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False |
258917371 | Enders, David | NaN | Journalist | M | 1444 | 484 | 6296 | Mon Feb 28 19:52:03 +0000 2011 | True | False |
297046834 | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 759 | 352 | 631 | Wed May 11 20:55:24 +0000 2011 | True | False |
455585786 | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2944 | 2691 | 6277 | Thu Jan 05 08:26:46 +0000 2012 | True | False |
42584840 | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2703 | 201 | 6366 | Tue May 26 07:41:38 +0000 2009 | False | False |
# The inner join drops every row whose follower is not in user_summary_df,
# i.e. journalists that have no tweets in the dataset
follower_gender = user_summary_df['gender']
follower_to_followed_df = base_follower_to_followed_df.join(follower_gender, on='follower_user_id', how='inner')
follower_to_followed_df.count()
follower_user_id 3311406 followed_user_id 3311406 gender 3311406 dtype: int64
follower_to_followed_df.head()
follower_user_id | followed_user_id | gender | |
---|---|---|---|
261 | 15219888 | 3291076716 | F |
262 | 15219888 | 119175339 | F |
263 | 15219888 | 418837047 | F |
264 | 15219888 | 259817885 | F |
265 | 15219888 | 287263845 | F |
# Screen name lookup for followed accounts, indexed by user_id. The id is read
# with the builtin str (the np.str alias was removed from NumPy).
followed_screen_name_lookup_df = pd.read_csv('source_data/followed.csv',
                                             names=['screen_name', 'user_id'],
                                             dtype={'user_id': str}).set_index(['user_id'])
followed_screen_name_lookup_df.head()
screen_name | |
---|---|
user_id | |
17665874 | onlinehigh |
2389275799 | HLSPOLICY |
314728983 | Veolia_NA |
239409802 | fishingbuk |
522799320 | GoldsmithBev |
# Inner join on the followed id keeps only pairs where the followed account is a
# journalist in user_summary_df; that journalist's gender is added as followed_gender
followed_gender = user_summary_df['gender'].rename('followed_gender')
follower_to_journalist_followed_df = follower_to_followed_df.join(followed_gender, how='inner', on='followed_user_id')
follower_to_journalist_followed_df.count()
follower_user_id 280340 followed_user_id 280340 gender 280340 followed_gender 280340 dtype: int64
follower_to_journalist_followed_df.head()
follower_user_id | followed_user_id | gender | followed_gender | |
---|---|---|---|---|
287 | 15219888 | 46582653 | F | M |
21810 | 15780280 | 46582653 | M | M |
24153 | 14245722 | 46582653 | M | M |
40694 | 37865281 | 46582653 | F | M |
66585 | 165204211 | 46582653 | M | M |
# Gender of beltway journalists followed by beltway journalists
def journalist_followed_gender_summary(follower_to_followed_df):
    """Break follow relationships down by the gender of the followed journalist.

    Args:
        follower_to_followed_df: DataFrame with one row per follow
            relationship and a ``followed_gender`` column.

    Returns:
        DataFrame indexed by gender (index named ``'index'``) with columns
        ``count`` (relationships whose followed account has that gender),
        ``percentage`` (share of all relationships, formatted like ``'62.2%'``)
        and ``avg_followed`` (``count`` divided by that gender's ``count`` in
        the module-level ``journalist_gender_summary_df``).

    Every followed gender is expected to have a row in
    ``journalist_gender_summary_df``.
    """
    followed_gender_counts = follower_to_followed_df.followed_gender.value_counts()
    # Same ratio value_counts(normalize=True) yields, without a second pass.
    followed_gender_share = followed_gender_counts / followed_gender_counts.sum()
    gender_summary_df = pd.DataFrame({
        'count': followed_gender_counts,
        'percentage': followed_gender_share.mul(100).round(1).astype(str) + '%',
    })
    # Look the per-gender totals up by label instead of going through
    # reset_index(): the name of the column reset_index() creates depends on
    # the pandas version, so relying on it being 'index' is fragile.
    journalist_counts = journalist_gender_summary_df.loc[gender_summary_df.index, 'count']
    gender_summary_df['avg_followed'] = gender_summary_df['count'] / journalist_counts.values
    # Keep the index label the notebook output has always shown.
    gender_summary_df.index.name = 'index'
    return gender_summary_df
def journalist_following_summary(follower_to_followed_df):
    """Rank the journalists in ``user_summary_df`` by journalist followers.

    Args:
        follower_to_followed_df: DataFrame with one row per follow
            relationship and a ``followed_user_id`` column.

    Returns:
        A copy of the module-level ``user_summary_df`` with an added
        ``journalist_follower_count`` column (how often each user_id appears
        as ``followed_user_id``; 0 when it never does), sorted descending by
        ``journalist_follower_count`` and then ``followers_count``.
    """
    # Following count
    following_count_df = pd.DataFrame(
        follower_to_followed_df.followed_user_id.value_counts().rename('journalist_follower_count'))
    # Join with user summary
    journalist_following_summary_df = user_summary_df.join(following_count_df)
    # Fill only the joined count column. A frame-wide fillna(0) would also
    # overwrite genuinely missing values in unrelated columns with 0.
    journalist_following_summary_df['journalist_follower_count'] = (
        journalist_following_summary_df['journalist_follower_count'].fillna(0))
    return journalist_following_summary_df.sort_values(
        ['journalist_follower_count', 'followers_count'], ascending=False)
# Gender of top journalists followed by beltway journalists
def top_journalist_followed_gender_summary(followed_summary_df, head=100):
    """Gender breakdown of the first ``head`` rows of a followed summary.

    Args:
        followed_summary_df: DataFrame with a ``gender`` column, already
            sorted so that the rows of interest come first.
        head: Number of leading rows to include (default 100).

    Returns:
        DataFrame indexed by gender with columns ``count`` (rows with that
        gender) and ``percentage`` (share of the selected rows, formatted
        like ``'75.0%'``).
    """
    gender_counts = followed_summary_df.head(head).gender.value_counts()
    # Same ratio value_counts(normalize=True) yields, without a second pass.
    gender_share = gender_counts / gender_counts.sum()
    return pd.DataFrame({
        'count': gender_counts,
        'percentage': gender_share.mul(100).round(1).astype(str) + '%',
    })
# Columns shown when displaying journalist following summaries
journalist_following_summary_fields = [
    'screen_name',
    'name',
    'organization',
    'gender',
    'followers_count',
    'journalist_follower_count',
]
# For every followed account, count how many rows of follower_to_followed_df
# point at it, then label each account with its screen name from the lookup.
all_followed_df = pd.DataFrame(
    follower_to_followed_df['followed_user_id'].value_counts().rename('following_count')
).join(followed_screen_name_lookup_df)
all_followed_df.to_csv('output/all_followed_by_journalists.csv')
all_followed_df.head(n=25)
following_count | screen_name | |
---|---|---|
813286 | 1671 | BarackObama |
51241574 | 1629 | AP |
25073877 | 1613 | realDonaldTrump |
807095 | 1581 | nytimes |
2467791 | 1532 | washingtonpost |
1339835893 | 1531 | HillaryClinton |
818927131883356161 | 1522 | PressSec |
822215673812119553 | 1507 | WhiteHouse |
822215679726100480 | 1488 | POTUS |
9300262 | 1457 | politico |
30313925 | 1402 | ObamaWhiteHouse |
14246001 | 1384 | mikeallen |
93069110 | 1368 | maggieNYT |
14529929 | 1337 | jaketapper |
428333 | 1289 | cnnbrk |
3108351 | 1279 | WSJ |
1536791610 | 1279 | POTUS44 |
50325797 | 1258 | chucktodd |
113420831 | 1258 | PressSec44 |
16017475 | 1234 | NateSilver538 |
18622869 | 1231 | ezraklein |
86129724 | 1173 | costareports |
1652541 | 1144 | Reuters |
1330457336 | 1128 | billclinton |
5392522 | 1124 | NPR |
# Rank journalists by the number of journalists following them, save the full
# table, and show the top 25 with the display columns only.
follower_to_journalist_followed_summary_df = journalist_following_summary(
    follower_to_followed_df=follower_to_journalist_followed_df
)
follower_to_journalist_followed_summary_df.to_csv('output/journalists_followed_by_journalists.csv')
follower_to_journalist_followed_summary_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,337.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 1,258.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,116.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,107.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,106.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 1,082.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 1,032.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 977.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 973.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 970.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 915.00 |
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 909.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 892.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 884.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 880.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 877.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 872.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 868.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 866.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 860.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 860.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 856.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 830.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 788.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 783.00 |
# Followed-gender breakdown over all journalist -> journalist follow relationships
journalist_followed_gender_summary(follower_to_journalist_followed_df)
count | percentage | avg_followed | |
---|---|---|---|
index | |||
M | 174283 | 62.2% | 134.17 |
F | 106057 | 37.8% | 106.80 |
# Distribution of journalist_follower_count across all journalists in the summary
follower_to_journalist_followed_summary_df[['journalist_follower_count']].describe()
journalist_follower_count | |
---|---|
count | 2,292.00 |
mean | 122.31 |
std | 161.53 |
min | 0.00 |
25% | 26.00 |
50% | 64.00 |
75% | 145.00 |
max | 1,337.00 |
# Female journalists only, in the same order as the overall ranking
follower_to_female_journalist_followed_df = follower_to_journalist_followed_summary_df[
    follower_to_journalist_followed_summary_df['gender'] == 'F'
]
follower_to_female_journalist_followed_df.to_csv('output/female_journalists_followed_by_journalists.csv')
follower_to_female_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 909.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 884.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 877.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 860.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 777.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 738.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 679.00 |
21307076 | SusanPage | Page, Susan | USA Today | F | 48675 | 670.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 664.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 631.00 |
224320485 | KellyO | O’Donnell, Kelly | NBC News | F | 148476 | 630.00 |
20776497 | BFischerMartin | Fischer Martin, Betsy | Bloomberg News | F | 50890 | 609.00 |
77032777 | apalmerdc | Palmer, Anna A. | Politico | F | 30523 | 591.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 589.00 |
237477771 | juliehdavis | Davis, Julie | New York Times | F | 49821 | 570.00 |
58869089 | margarettalev | Talev, Margaret | Bloomberg News | F | 19588 | 569.00 |
14870670 | KateNocera | Nocera, Kate | BuzzFeed | F | 27714 | 567.00 |
46817943 | brikeilarcnn | Keilar, Brianna | CNN | F | 105276 | 557.00 |
22772264 | carolelee | Lee, Carol | Wall Street Journal / Dow Jones | F | 31840 | 552.00 |
15159913 | JFKucinich | Kucinich, Jacqueline | Daily Beast | F | 31210 | 549.00 |
297532865 | kwelkernbc | Welker, Kristen | NBC News | F | 99234 | 537.00 |
15727317 | aterkel | Terkel, Amanda | Huffington Post | F | 78736 | 527.00 |
17881467 | rebeccagberg | Berg, Rebecca | RealClearPolitics | F | 48798 | 516.00 |
151444950 | DaviSusan | Davis, Susan | National Public Radio | F | 27297 | 506.00 |
27055034 | SabrinaSiddiqui | Siddiqui, Sabrina | Guardian US | F | 53835 | 474.00 |
# Distribution of journalist_follower_count among female journalists
follower_to_female_journalist_followed_df[['journalist_follower_count']].describe()
journalist_follower_count | |
---|---|
count | 993.00 |
mean | 106.80 |
std | 131.81 |
min | 0.00 |
25% | 24.00 |
50% | 59.00 |
75% | 131.00 |
max | 909.00 |
# Male journalists only, in the same order as the overall ranking
follower_to_male_journalist_followed_df = follower_to_journalist_followed_summary_df[
    follower_to_journalist_followed_summary_df['gender'] == 'M'
]
follower_to_male_journalist_followed_df.to_csv('output/male_journalists_followed_by_journalists.csv')
follower_to_male_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,337.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 1,258.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,116.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,107.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,106.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 1,082.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 1,032.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 977.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 973.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 970.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 915.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 892.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 880.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 872.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 868.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 866.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 860.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 856.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 830.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 788.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 783.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 781.00 |
18172905 | rickklein | Klein, Richard | ABC News | M | 109170 | 737.00 |
21768766 | jonathanweisman | Weisman, Jonathan | New York Times | M | 57549 | 728.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 728.00 |
# Distribution of journalist_follower_count among male journalists
follower_to_male_journalist_followed_df[['journalist_follower_count']].describe()
journalist_follower_count | |
---|---|
count | 1,299.00 |
mean | 134.17 |
std | 180.14 |
min | 0.00 |
25% | 28.00 |
50% | 67.00 |
75% | 156.00 |
max | 1,337.00 |
# Rank journalists counting only follow relationships whose follower is female
female_follower_to_journalist_followed_df = journalist_following_summary(
    follower_to_followed_df=follower_to_journalist_followed_df[
        follower_to_journalist_followed_df['gender'] == 'F'
    ]
)
female_follower_to_journalist_followed_df.to_csv('output/journalists_followed_by_female_journalists.csv')
female_follower_to_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 619.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 569.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 505.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 490.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 484.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 474.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 445.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 444.00 |
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 441.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 435.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 434.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 430.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 420.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 402.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 398.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 397.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 394.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 390.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 389.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 386.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 382.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 379.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 366.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 354.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 339.00 |
# Followed-gender breakdown for follow relationships whose follower is female
journalist_followed_gender_summary(
    follower_to_followed_df=follower_to_journalist_followed_df[
        follower_to_journalist_followed_df['gender'] == 'F'
    ]
)
count | percentage | avg_followed | |
---|---|---|---|
index | |||
M | 73950 | 62.0% | 56.93 |
F | 45300 | 38.0% | 45.62 |
# Rank journalists counting only follow relationships whose follower is male
male_follower_to_journalist_followed_df = journalist_following_summary(
    follower_to_followed_df=follower_to_journalist_followed_df[
        follower_to_journalist_followed_df['gender'] == 'M'
    ]
)
male_follower_to_journalist_followed_df.to_csv('output/journalists_followed_by_male_journalists.csv')
male_follower_to_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 718.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 689.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 626.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 622.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 608.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 602.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 587.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 543.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 535.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 529.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 510.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 495.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 494.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 491.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 482.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 482.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 477.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 474.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 469.00 |
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 468.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 464.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 458.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 454.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 446.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 441.00 |
# Followed-gender breakdown for follow relationships whose follower is male
journalist_followed_gender_summary(
    follower_to_followed_df=follower_to_journalist_followed_df[
        follower_to_journalist_followed_df['gender'] == 'M'
    ]
)
count | percentage | avg_followed | |
---|---|---|---|
index | |||
M | 100333 | 62.3% | 77.24 |
F | 60757 | 37.7% | 61.19 |