%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Set the float display format so that pandas doesn't use scientific notation
pd.options.display.float_format = '{:20,.2f}'.format
def tweet_transform(tweet):
    """Reduce a raw tweet dict to the fields kept in the per-tweet DataFrame.

    :param tweet: tweet dict; must provide 'id_str', 'created_at' and a
        'user' dict with 'id_str' and 'screen_name'.
    :return: dict with the keys tweet_id, tweet_created_at (parsed with
        date_parse), user_id, screen_name and tweet_type (as returned by
        tweet_type()).
    """
    tweet_id = tweet['id_str']
    created_at = date_parse(tweet['created_at'])
    author = tweet['user']
    return dict(
        tweet_id=tweet_id,
        tweet_created_at=created_at,
        user_id=author['id_str'],
        screen_name=author['screen_name'],
        tweet_type=tweet_type(tweet),
    )
# Build the per-tweet DataFrame: tweet_transform is applied on load and
# 'tweet_id' is passed as the de-duplication column.
tweet_df = load_tweet_df(
    tweet_transform,
    ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type'],
    dedupe_columns=['tweet_id'],
)
# Non-null count per column
tweet_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 817136 user_id 817136 screen_name 817136 tweet_created_at 817136 tweet_type 817136 dtype: int64
tweet_df.head()
tweet_id | user_id | screen_name | tweet_created_at | tweet_type | |
---|---|---|---|---|---|
0 | 872631046088601600 | 327862439 | jonathanvswan | 2017-06-08 01:47:08+00:00 | retweet |
1 | 872610483647516673 | 327862439 | jonathanvswan | 2017-06-08 00:25:26+00:00 | retweet |
2 | 872609618626826240 | 327862439 | jonathanvswan | 2017-06-08 00:22:00+00:00 | retweet |
3 | 872605974699311104 | 327862439 | jonathanvswan | 2017-06-08 00:07:31+00:00 | retweet |
4 | 872603191518646276 | 327862439 | jonathanvswan | 2017-06-07 23:56:27+00:00 | retweet |
tweet_df.tweet_created_at.min()
Timestamp('2017-06-01 04:00:01+0000', tz='UTC')
tweet_df.tweet_created_at.max()
Timestamp('2017-08-01 03:59:58+0000', tz='UTC')
# Tweets per tweet type, as an absolute count and as a share of all tweets
tweet_type_counts = tweet_df.tweet_type.value_counts()
tweet_type_shares = tweet_df.tweet_type.value_counts(normalize=True)
pd.DataFrame({
    'count': tweet_type_counts,
    'percentage': tweet_type_shares.mul(100).round(1).astype(str) + '%',
})
count | percentage | |
---|---|---|
retweet | 345266 | 42.3% |
original | 233926 | 28.6% |
reply | 126254 | 15.5% |
quote | 111690 | 13.7% |
The user lookup comes from the following sources: the Senate press, periodical press, and radio and television lookup files loaded below.
Thus, the tweet data should include tweets and user info only for users in the user lookup.
user_lookup_filepaths = ('lookups/senate_press_lookup.csv',
                         'lookups/periodical_press_lookup.csv',
                         'lookups/radio_and_television_lookup.csv')
# Read only the id and screen-name columns of each lookup; ids stay strings.
lookup_frames = [
    pd.read_csv(lookup_path, usecols=['Uid', 'Token'], dtype={'Uid': str})
    for lookup_path in user_lookup_filepaths
]
user_lookup_df = (
    pd.concat(lookup_frames)
    .set_index('Uid')
    .rename(columns={'Token': 'screen_name'})
)
user_lookup_df.index.names = ['user_id']
# A user may appear in more than one lookup, so keep the first row per user_id
is_repeated_user = user_lookup_df.index.duplicated()
user_lookup_df = user_lookup_df[~is_repeated_user]
user_lookup_df.count()
screen_name 2487 dtype: int64
user_lookup_df.head()
screen_name | |
---|---|
user_id | |
23455653 | abettel |
33919343 | AshleyRParker |
18580432 | b_fung |
399225358 | b_muzz |
18834692 | becca_milfeld |
# One row per user, one column per tweet type; a user/type pair with no
# tweets comes out of unstack() as NaN and is replaced by 0.
user_tweet_count_df = (
    tweet_df.groupby(['user_id', 'tweet_type'])
    .size()
    .unstack()
    .fillna(0)
)
# Total tweets per user across the four tweet types
tweet_type_columns = ['original', 'quote', 'reply', 'retweet']
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[tweet_type_columns].sum(axis=1)
user_tweet_count_df.count()
tweet_type original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 dtype: int64
user_tweet_count_df.head()
tweet_type | original | quote | reply | retweet | tweets_in_dataset |
---|---|---|---|---|---|
user_id | |||||
1001991865 | 13.00 | 3.00 | 1.00 | 31.00 | 48.00 |
1002229862 | 48.00 | 20.00 | 3.00 | 118.00 | 189.00 |
100270054 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 |
100802089 | 4.00 | 7.00 | 12.00 | 17.00 | 40.00 |
100860790 | 102.00 | 26.00 | 4.00 | 166.00 | 298.00 |
# Column names are supplied via `names`, in this order; user_id is read as
# a string and becomes the index.
user_info_columns = ['user_id', 'name', 'organization', 'position',
                     'gender', 'followers_count', 'following_count', 'tweet_count',
                     'user_created_at', 'verified', 'protected']
user_info_df = pd.read_csv('source_data/user_info_lookup.csv',
                           names=user_info_columns,
                           dtype={'user_id': str})
user_info_df = user_info_df.set_index(['user_id'])
user_info_df.count()
name 2506 organization 2477 position 2503 gender 2505 followers_count 2506 following_count 2506 tweet_count 2506 user_created_at 2506 verified 2506 protected 2506 dtype: int64
user_info_df.head()
name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
20711445 | Glinski, Nina | NaN | Freelance Reporter | F | 963 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False |
258917371 | Enders, David | NaN | Journalist | M | 1444 | 484 | 6296 | Mon Feb 28 19:52:03 +0000 2011 | True | False |
297046834 | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 759 | 352 | 631 | Wed May 11 20:55:24 +0000 2011 | True | False |
455585786 | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2944 | 2691 | 6277 | Thu Jan 05 08:26:46 +0000 2012 | True | False |
42584840 | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2703 | 201 | 6366 | Tue May 26 07:41:38 +0000 2009 | False | False |
# Attach the user info and per-type tweet counts to every user in the lookup.
user_summary_df = user_lookup_df.join([user_info_df, user_tweet_count_df], how='left')
# Fill NaNs. This is done with one frame-level fillna instead of
# `user_summary_df[col].fillna(..., inplace=True)`: the chained inplace form
# operates on a temporary column object, which emits a warning on recent
# pandas and leaves the frame unchanged under Copy-on-Write.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()
screen_name 2487 name 2487 organization 2487 position 2484 gender 2486 followers_count 2487 following_count 2487 tweet_count 2487 user_created_at 2487 verified 2487 protected 2487 original 2487 quote 2487 reply 2487 retweet 2487 tweets_in_dataset 2487 dtype: int64
user_summary_df.head()
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||||||||
23455653 | abettel | Bettelheim, Adriel | Politico | Health Care Editor | F | 2664 | 1055 | 15990 | Mon Mar 09 16:32:20 +0000 2009 | True | False | 289.00 | 12.00 | 6.00 | 52.00 | 359.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | White House Reporter | F | 122382 | 2342 | 12433 | Tue Apr 21 14:28:57 +0000 2009 | True | False | 172.00 | 67.00 | 11.00 | 120.00 | 370.00 |
18580432 | b_fung | Fung, Brian | Washington Post | Tech Reporter | M | 16558 | 2062 | 44799 | Sat Jan 03 15:15:57 +0000 2009 | True | False | 257.00 | 85.00 | 205.00 | 82.00 | 629.00 |
399225358 | b_muzz | Murray, Brendan | Bloomberg News | Managing Editor, U.S. Economy | M | 624 | 382 | 360 | Thu Oct 27 05:34:05 +0000 2011 | True | False | 3.00 | 0.00 | 0.00 | 5.00 | 8.00 |
18834692 | becca_milfeld | Milfeld, Becca | Agence France-Presse | English Desk Editor and Journalist | F | 483 | 993 | 1484 | Sat Jan 10 13:58:43 +0000 2009 | False | False | 3.00 | 14.00 | 0.00 | 7.00 | 24.00 |
user_summary_df[user_summary_df.tweets_in_dataset == 0].count()
screen_name 195 name 195 organization 195 position 195 gender 194 followers_count 195 following_count 195 tweet_count 195 user_created_at 195 verified 195 protected 195 original 195 quote 195 reply 195 retweet 195 tweets_in_dataset 195 dtype: int64
# Keep only the users that have at least one tweet in the dataset
has_tweets = user_summary_df['tweets_in_dataset'] != 0
user_summary_df = user_summary_df.loc[has_tweets]
user_summary_df.count()
screen_name 2292 name 2292 organization 2292 position 2289 gender 2292 followers_count 2292 following_count 2292 tweet_count 2292 user_created_at 2292 verified 2292 protected 2292 original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 dtype: int64
# Users per gender, as an absolute count and as a share of all users
gender_counts = user_summary_df.gender.value_counts()
gender_shares = user_summary_df.gender.value_counts(normalize=True)
pd.DataFrame({
    'count': gender_counts,
    'percentage': gender_shares.mul(100).round(1).astype(str) + '%',
})
count | percentage | |
---|---|---|
M | 1299 | 56.7% |
F | 993 | 43.3% |
user_summary_df[['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()
followers_count | following_count | tweet_count | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
count | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 |
mean | 16,467.62 | 1,444.83 | 9,619.69 | 102.06 | 48.73 | 55.08 | 150.64 | 356.52 |
std | 91,886.90 | 3,003.00 | 16,618.09 | 169.43 | 135.90 | 249.18 | 585.08 | 833.76 |
min | 6.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
25% | 831.75 | 505.75 | 1,449.50 | 10.00 | 1.00 | 1.00 | 8.00 | 32.00 |
50% | 2,419.50 | 998.50 | 4,211.50 | 41.00 | 9.00 | 5.00 | 39.00 | 122.00 |
75% | 7,348.75 | 1,713.50 | 10,817.25 | 124.25 | 43.00 | 30.00 | 129.00 | 375.00 |
max | 2,176,578.00 | 96,194.00 | 208,763.00 | 2,693.00 | 3,069.00 | 9,033.00 | 21,524.00 | 21,547.00 |
user_summary_df[user_summary_df.gender == 'F'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()
followers_count | following_count | tweet_count | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
count | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 |
mean | 11,609.53 | 1,314.07 | 7,498.74 | 83.84 | 39.27 | 32.06 | 135.55 | 290.72 |
std | 65,563.72 | 1,250.56 | 11,312.72 | 124.86 | 135.05 | 94.73 | 724.92 | 833.07 |
min | 6.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
25% | 825.00 | 567.00 | 1,393.00 | 8.00 | 1.00 | 1.00 | 9.00 | 32.00 |
50% | 2,327.00 | 1,034.00 | 4,055.00 | 39.00 | 9.00 | 4.00 | 37.00 | 111.00 |
75% | 6,340.00 | 1,659.00 | 8,983.00 | 111.00 | 33.00 | 21.00 | 115.00 | 314.00 |
max | 1,388,543.00 | 18,197.00 | 118,713.00 | 1,440.00 | 3,069.00 | 1,458.00 | 21,524.00 | 21,547.00 |
user_summary_df[user_summary_df.gender == 'M'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()
followers_count | following_count | tweet_count | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
count | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 |
mean | 20,181.31 | 1,544.78 | 11,241.02 | 115.99 | 55.96 | 72.69 | 162.17 | 406.81 |
std | 107,635.37 | 3,833.89 | 19,584.46 | 195.72 | 136.16 | 319.41 | 449.75 | 831.10 |
min | 10.00 | 0.00 | 5.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
25% | 857.50 | 472.00 | 1,477.00 | 12.00 | 0.00 | 1.00 | 6.00 | 33.00 |
50% | 2,498.00 | 953.00 | 4,401.00 | 44.00 | 9.00 | 6.00 | 40.00 | 131.00 |
75% | 8,341.50 | 1,763.00 | 12,584.50 | 140.00 | 50.50 | 38.50 | 142.00 | 428.00 |
max | 2,176,578.00 | 96,194.00 | 208,763.00 | 2,693.00 | 1,955.00 | 9,033.00 | 7,528.00 | 11,432.00 |
# Verified vs. unverified users, as an absolute count and as a share
verified_counts = user_summary_df.verified.value_counts()
verified_shares = user_summary_df.verified.value_counts(normalize=True)
pd.DataFrame({
    'count': verified_counts,
    'percentage': verified_shares.mul(100).round(1).astype(str) + '%',
})
count | percentage | |
---|---|---|
True | 1240 | 54.1% |
False | 1052 | 45.9% |
# Verified vs. unverified among users with gender 'F'
female_verified = user_summary_df[user_summary_df.gender == 'F'].verified
pd.DataFrame({
    'count': female_verified.value_counts(),
    'percentage': female_verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
})
count | percentage | |
---|---|---|
True | 512 | 51.6% |
False | 481 | 48.4% |
# Verified vs. unverified among users with gender 'M'
male_verified = user_summary_df[user_summary_df.gender == 'M'].verified
pd.DataFrame({
    'count': male_verified.value_counts(),
    'percentage': male_verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
})
count | percentage | |
---|---|---|
True | 728 | 56.0% |
False | 571 | 44.0% |
Including original tweets only
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Set the float display format so that pandas doesn't use scientific notation
pd.options.display.float_format = '{:20,.2f}'.format
# Simplify the tweet on load
def mention_transform(tweet):
    """Return one row per user mentioned in an original tweet.

    Tweets that tweet_type() does not classify as 'original', and original
    tweets without any user mentions, produce an empty list.

    :param tweet: tweet dict; must provide 'id_str', 'created_at' and a
        'user' dict, and may provide 'entities' -> 'user_mentions'.
    :return: list of dicts (one per mention) with the keys tweet_id,
        user_id, screen_name, mention_user_id, mention_screen_name and
        tweet_created_at.
    """
    if tweet_type(tweet) != 'original':
        return []
    user_mentions = tweet.get('entities', {}).get('user_mentions', [])
    if not user_mentions:
        return []
    # These values are the same for every mention in the tweet, so look them
    # up (and parse the timestamp) once rather than once per mention.
    tweet_id = tweet['id_str']
    author = tweet['user']
    tweet_created_at = date_parse(tweet['created_at'])
    return [
        {
            'tweet_id': tweet_id,
            'user_id': author['id_str'],
            'screen_name': author['screen_name'],
            'mention_user_id': mention['id_str'],
            'mention_screen_name': mention['screen_name'],
            'tweet_created_at': tweet_created_at,
        }
        for mention in user_mentions
    ]
# Build the per-mention DataFrame: mention_transform is applied on load and
# the (tweet_id, mention_user_id) pair is passed as the de-duplication key.
mention_columns = ['tweet_id', 'user_id', 'screen_name', 'mention_user_id',
                   'mention_screen_name', 'tweet_created_at']
base_mention_df = load_tweet_df(
    mention_transform,
    mention_columns,
    dedupe_columns=['tweet_id', 'mention_user_id'],
)
base_mention_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 118210 user_id 118210 screen_name 118210 mention_user_id 118210 mention_screen_name 118210 tweet_created_at 118210 dtype: int64
base_mention_df.head()
tweet_id | user_id | screen_name | mention_user_id | mention_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 872522339962978307 | 327862439 | jonathanvswan | 800707492346925056 | axios | 2017-06-07 18:35:11+00:00 |
1 | 872484939530461184 | 327862439 | jonathanvswan | 17494010 | SenSchumer | 2017-06-07 16:06:34+00:00 |
2 | 872475140575170562 | 327862439 | jonathanvswan | 2836421 | MSNBC | 2017-06-07 15:27:37+00:00 |
3 | 872475140575170562 | 327862439 | jonathanvswan | 800707492346925056 | axios | 2017-06-07 15:27:37+00:00 |
4 | 872459457946673154 | 327862439 | jonathanvswan | 800707492346925056 | axios | 2017-06-07 14:25:18+00:00 |
# Add the gender of the tweeting user to every mention row
author_gender = user_summary_df['gender']
mention_df = base_mention_df.join(author_gender, on='user_id')
mention_df.count()
tweet_id 118210 user_id 118210 screen_name 118210 mention_user_id 118210 mention_screen_name 118210 tweet_created_at 118210 gender 118210 dtype: int64
mention_df['tweet_id'].unique().size
84942
mention_df['mention_user_id'].unique().size
17730
# Keep only the mentions whose mentioned user is in user_summary_df (inner
# join) and record that user's gender as 'mention_gender'.
mentioned_gender = user_summary_df['gender'].rename('mention_gender')
journalists_mention_df = mention_df.join(mentioned_gender, how='inner', on='mention_user_id')
journalists_mention_df.count()
tweet_id 14298 user_id 14298 screen_name 14298 mention_user_id 14298 mention_screen_name 14298 tweet_created_at 14298 gender 14298 mention_gender 14298 dtype: int64
journalists_mention_df.head()
tweet_id | user_id | screen_name | mention_user_id | mention_screen_name | tweet_created_at | gender | mention_gender | |
---|---|---|---|---|---|---|---|---|
16 | 870408075878027268 | 327862439 | jonathanvswan | 16031927 | greta | 2017-06-01 22:33:51+00:00 | M | F |
283 | 872581449861541893 | 19847765 | sahilkapur | 16031927 | greta | 2017-06-07 22:30:04+00:00 | M | F |
2202 | 872578055910371328 | 21252618 | JakeSherman | 16031927 | greta | 2017-06-07 22:16:34+00:00 | M | F |
15977 | 880841069243629568 | 70511174 | Hadas_Gold | 16031927 | greta | 2017-06-30 17:30:50+00:00 | F | F |
17258 | 880183952018886661 | 90077282 | politicoalex | 16031927 | greta | 2017-06-28 21:59:41+00:00 | M | F |
# Gender of beltway journalists mentioned by beltway journalists
def journalist_mention_gender_summary(mention_df):
    """Summarize mentions by the gender of the mentioned user.

    :param mention_df: DataFrame with a 'mention_gender' column, one row
        per mention.
    :return: DataFrame indexed by gender with a 'count' column and a
        'percentage' column (share of all mentions, formatted like '58.0%').
    """
    # Count once and derive the shares from the counts instead of running
    # value_counts() a second time with normalize=True.
    gender_counts = mention_df.mention_gender.value_counts()
    gender_shares = gender_counts.div(gender_counts.sum())
    return pd.DataFrame({
        'count': gender_counts,
        'percentage': gender_shares.mul(100).round(1).astype(str) + '%',
    })
def journalist_mention_summary(mention_df, user_df=None):
    """Summarize, per user, how often and by how many users they were mentioned.

    :param mention_df: DataFrame with 'mention_user_id' and 'user_id'
        columns, one row per mention.
    :param user_df: DataFrame indexed by user id that the counts are joined
        onto; must have a 'followers_count' column. Defaults to the
        notebook-level user_summary_df.
    :return: user_df plus 'mention_count' (number of mentions) and
        'mentioning_count' (number of distinct mentioning users), both 0 for
        users never mentioned, sorted descending by mention_count,
        mentioning_count and followers_count.
    """
    if user_df is None:
        user_df = user_summary_df
    user_index_name = user_df.index.name
    # Mention count
    mention_counts = (
        mention_df.mention_user_id.value_counts()
        .rename('mention_count')
        .rename_axis(user_index_name)
    )
    # Mentioning users. That is, the number of unique users mentioning each user.
    mention_pairs = mention_df[['mention_user_id', 'user_id']].drop_duplicates()
    mentioning_counts = (
        mention_pairs.groupby('mention_user_id').size()
        .rename('mentioning_count')
        .rename_axis(user_index_name)
    )
    # Join with user summary
    summary_df = user_df.join(mention_counts).join(mentioning_counts)
    # Only the two count columns get 0 for "never mentioned"; a frame-wide
    # fillna(0) would also overwrite missing user fields (e.g. position).
    count_columns = ['mention_count', 'mentioning_count']
    summary_df[count_columns] = summary_df[count_columns].fillna(0)
    return summary_df.sort_values(
        ['mention_count', 'mentioning_count', 'followers_count'], ascending=False)
# Gender of top journalists mentioned by beltway journalists
def top_journalist_mention_gender_summary(mention_summary_df, mentioning_count_threshold=0, head=100):
    """Summarize the gender split of the first rows of a mention summary.

    :param mention_summary_df: DataFrame with 'mentioning_count' and
        'gender' columns; rows are taken in their existing order.
    :param mentioning_count_threshold: only rows whose mentioning_count is
        strictly greater than this are considered.
    :param head: maximum number of (filtered) rows to include.
    :return: DataFrame indexed by gender with a 'count' column and a
        'percentage' column (formatted like '58.0%').
    """
    is_above_threshold = mention_summary_df.mentioning_count > mentioning_count_threshold
    top_mention_summary_df = mention_summary_df[is_above_threshold].head(head)
    # Count once and derive the shares from the counts instead of running
    # value_counts() a second time with normalize=True.
    gender_counts = top_mention_summary_df.gender.value_counts()
    gender_shares = gender_counts.div(gender_counts.sum())
    return pd.DataFrame({
        'count': gender_counts,
        'percentage': gender_shares.mul(100).round(1).astype(str) + '%',
    })
# Fields for displaying journalist mention summaries
journalist_mention_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'mention_count', 'mentioning_count']
Note that for each of these, the complete list is being written to CSV in the output directory.
# Original tweets per gender, with each gender's share of all original tweets
original_tweets_by_gender_df = user_summary_df[['gender', 'original']].groupby('gender').sum()
total_original_tweets = user_summary_df.original.sum()
original_share = original_tweets_by_gender_df['original'] / total_original_tweets
original_tweets_by_gender_df['percentage'] = original_share.mul(100).round(1).astype(str) + '%'
original_tweets_by_gender_df
original | percentage | |
---|---|---|
gender | ||
F | 83,251.00 | 35.6% |
M | 150,675.00 | 64.4% |
user_summary_df[['screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'original', 'tweets_in_dataset']].sort_values(['original'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | tweet_count | original | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
user_id | ||||||||
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 61461 | 2,693.00 | 2,693.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 115132 | 1,858.00 | 2,089.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 92203 | 1,534.00 | 5,187.00 |
19580890 | LeeCamp | Camp, Lee | RTTV America | M | 67601 | 52051 | 1,517.00 | 3,708.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 100803 | 1,440.00 | 8,196.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 49967 | 1,332.00 | 4,414.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 148143 | 1,316.00 | 5,078.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 161148 | 1,271.00 | 2,106.00 |
36246939 | malbertnews | Albert, Mark | The Voyage Report | M | 3575 | 28230 | 1,078.00 | 1,151.00 |
117467779 | palbergo | Albergo, Paul F. | Bloomberg BNA | M | 1191 | 18083 | 1,043.00 | 1,236.00 |
102171691 | rlocker12 | Locker, Ray | USA Today | M | 3665 | 41194 | 1,038.00 | 2,496.00 |
15486163 | SimonMarksFSN | Marks, Simon | Feature Story News | M | 7767 | 41541 | 984.00 | 3,432.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 142150 | 972.00 | 3,983.00 |
190360266 | connorobrienNH | O’Brien, Connor | Politico | M | 6158 | 17242 | 954.00 | 1,944.00 |
16031927 | greta | Van Susteren, Greta | MSNBC | F | 1186850 | 116645 | 907.00 | 4,792.00 |
300497193 | tackettdc | Tackett, R. Michael | New York Times | M | 16857 | 38620 | 896.00 | 1,041.00 |
191964162 | SamLitzinger | Litzinger, Sam | CBS News | M | 2329 | 95236 | 891.00 | 7,537.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 42497 | 885.00 | 3,960.00 |
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 208763 | 871.00 | 11,432.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 78015 | 846.00 | 6,377.00 |
27882000 | jamiedupree | Dupree, Jamie | Cox Broadcasting | M | 140848 | 46181 | 841.00 | 2,108.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 27294 | 836.00 | 1,673.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 104613 | 824.00 | 4,907.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 48025 | 822.00 | 1,604.00 |
13262862 | HowardMortman | Mortman, Howard | C–SPAN | M | 6211 | 38406 | 819.00 | 1,289.00 |
This is based on screen name, which could have changed during the collection period. However, for the users who would be at the top of this list, that seems unlikely.
# Number of mentions per mentioned screen name
mention_count_screen_name_df = (
    mention_df.mention_screen_name.value_counts()
    .rename('mention_count')
    .to_frame()
)
# Number of distinct mentioning users per mentioned screen name
mention_user_id_per_user_screen_name_df = mention_df[['mention_screen_name', 'user_id']].drop_duplicates()
mentioning_count_screen_name_df = (
    mention_user_id_per_user_screen_name_df.groupby('mention_screen_name')
    .size()
    .to_frame('mentioning_count')
    .rename_axis('screen_name')
)
# Combine both counts, write the complete list to CSV and preview the top 25
all_mentioned_df = mention_count_screen_name_df.join(mentioning_count_screen_name_df)
all_mentioned_df.to_csv('output/all_mentioned_by_journalists.csv')
all_mentioned_df.head(25)
mention_count | mentioning_count | |
---|---|---|
realDonaldTrump | 2876 | 452 |
POTUS | 2265 | 253 |
wusa9 | 2111 | 41 |
AP | 1948 | 143 |
USATODAY | 1235 | 105 |
nbcwashington | 1230 | 70 |
WSJ | 1227 | 152 |
dcexaminer | 1034 | 53 |
SHSanders45 | 927 | 148 |
nytimes | 829 | 289 |
BloombergBNA | 759 | 45 |
politico | 747 | 181 |
SpeakerRyan | 700 | 181 |
Scaramucci | 657 | 198 |
PressSec | 654 | 178 |
CNN | 628 | 186 |
ABC7News | 604 | 24 |
SenJohnMcCain | 599 | 231 |
WTOP | 529 | 43 |
BloombergLaw | 517 | 15 |
VP | 506 | 140 |
SteveScalise | 505 | 150 |
MSNBC | 486 | 92 |
Reuters | 483 | 84 |
bpolitics | 432 | 69 |
all_mentioned_df.sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)
mention_count | mentioning_count | |
---|---|---|
realDonaldTrump | 2876 | 452 |
nytimes | 829 | 289 |
POTUS | 2265 | 253 |
SenJohnMcCain | 599 | 231 |
Scaramucci | 657 | 198 |
CNN | 628 | 186 |
politico | 747 | 181 |
SpeakerRyan | 700 | 181 |
PressSec | 654 | 178 |
washingtonpost | 413 | 154 |
WSJ | 1227 | 152 |
SteveScalise | 505 | 150 |
SHSanders45 | 927 | 148 |
AP | 1948 | 143 |
VP | 506 | 140 |
SenateMajLdr | 412 | 120 |
DonaldJTrumpJr | 199 | 110 |
RandPaul | 206 | 107 |
USATODAY | 1235 | 105 |
LindseyGrahamSC | 253 | 105 |
SenSchumer | 265 | 97 |
NancyPelosi | 266 | 95 |
MSNBC | 486 | 92 |
CNNPolitics | 329 | 91 |
MarkWarner | 204 | 89 |
# Per-journalist mention summary: write the complete list to CSV, then
# preview the first 25 rows restricted to the display fields.
journalists_mention_summary_df = journalist_mention_summary(journalists_mention_df)
journalists_mention_summary_df.to_csv('output/journalists_mentioned_by_journalists.csv')
journalists_mention_summary_df.loc[:, journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
325050734 | AllysonRaeWx | Banks, Allyson | WUSA–TV | F | 6918 | 330.00 | 7.00 |
28496589 | TenaciousTopper | Shutt, Charles | WUSA–TV | M | 15868 | 239.00 | 13.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 235.00 | 10.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 212.00 | 46.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 200.00 | 31.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 143.00 | 41.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 127.00 | 51.00 |
169586280 | WaPoSean | Sullivan, Sean | Washington Post | M | 22860 | 117.00 | 20.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 116.00 | 47.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 115.00 | 55.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 109.00 | 22.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 107.00 | 43.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 106.00 | 42.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 105.00 | 27.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 104.00 | 40.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 100.00 | 31.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 98.00 | 16.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 95.00 | 43.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 87.00 | 14.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 84.00 | 41.00 |
26632935 | HopeSeck | Hodge Seck, Hope | Military.com | F | 4584 | 83.00 | 3.00 |
48802204 | HardballChris | Matthews, Chris | NBC News | M | 718330 | 80.00 | 9.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 78.00 | 37.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 78.00 | 27.00 |
24439201 | jameshohmann | Hohmann, James P. | Washington Post | M | 38708 | 78.00 | 27.00 |
journalists_mention_summary_df[journalist_mention_summary_fields].sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 115.00 | 55.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 127.00 | 51.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 116.00 | 47.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 212.00 | 46.00 |
112526560 | kenvogel | Vogel, Kenneth P. | Politico | M | 53894 | 67.00 | 45.00 |
18227519 | morningmika | Brzezinski, Mika | MSNBC | F | 653031 | 70.00 | 44.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 107.00 | 43.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 95.00 | 43.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 106.00 | 42.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 143.00 | 41.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 84.00 | 41.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 104.00 | 40.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 61.00 | 38.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 78.00 | 37.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 75.00 | 37.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 43.00 | 32.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 200.00 | 31.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 100.00 | 31.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 40.00 | 31.00 |
71294756 | wolfblitzer | Blitzer, Wolf | CNN | M | 1281914 | 56.00 | 30.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 52.00 | 30.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 67.00 | 29.00 |
16031927 | greta | Van Susteren, Greta | MSNBC | F | 1186850 | 37.00 | 28.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 105.00 | 27.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 78.00 | 27.00 |
journalist_mention_gender_summary(journalists_mention_df)
count | percentage | |
---|---|---|
M | 8298 | 58.0% |
F | 6000 | 42.0% |
journalists_mention_summary_df[['mention_count']].describe()
mention_count | |
---|---|
count | 2,292.00 |
mean | 6.24 |
std | 17.59 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 5.00 |
max | 330.00 |
# Rows of the mention summary with gender 'F': write the complete list to
# CSV, then preview the first 25 rows restricted to the display fields.
is_female = journalists_mention_summary_df.gender == 'F'
female_journalists_mention_summary_df = journalists_mention_summary_df[is_female]
female_journalists_mention_summary_df.to_csv('output/female_journalists_mentioned_by_journalists.csv')
female_journalists_mention_summary_df.loc[:, journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
325050734 | AllysonRaeWx | Banks, Allyson | WUSA–TV | F | 6918 | 330.00 | 7.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 200.00 | 31.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 143.00 | 41.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 115.00 | 55.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 109.00 | 22.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 100.00 | 31.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 87.00 | 14.00 |
26632935 | HopeSeck | Hodge Seck, Hope | Military.com | F | 4584 | 83.00 | 3.00 |
16441088 | jestei | Steinhauer, Jennifer | New York Times | F | 13452 | 76.00 | 26.00 |
18227519 | morningmika | Brzezinski, Mika | MSNBC | F | 653031 | 70.00 | 44.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 67.00 | 29.00 |
139738464 | mj_lee | Lee, MJ | CNN | F | 31940 | 67.00 | 27.00 |
204599219 | pw_cunningham | Cunningham, Paige | Washington Examiner | F | 9255 | 67.00 | 18.00 |
118747545 | eilperin | Eilperin, Juliet | Washington Post | F | 20483 | 67.00 | 16.00 |
360080772 | FoxReports | Fox, Lauren | CNN | F | 7282 | 65.00 | 15.00 |
58869089 | margarettalev | Talev, Margaret | Bloomberg News | F | 19588 | 58.00 | 27.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 58.00 | 5.00 |
19734832 | sarahkliff | Kliff, Sarah L. | Vox Media | F | 100090 | 57.00 | 27.00 |
381664207 | caitlinnowens | Owens, Caitlin N. | Axios | F | 5749 | 57.00 | 9.00 |
167024520 | rachaelmbade | Bade, Rachel M. | Politico | F | 30164 | 56.00 | 26.00 |
247852986 | rachanadixit | Pradhan, Rachana D. | Politico | F | 6178 | 55.00 | 14.00 |
237477771 | juliehdavis | Davis, Julie | New York Times | F | 49821 | 55.00 | 10.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 55.00 | 4.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 52.00 | 30.00 |
48144950 | JudyWoodruff | Woodruff, Judy | PBS NewsHour | F | 64294 | 49.00 | 7.00 |
female_journalists_mention_summary_df[['mention_count']].describe()
mention_count | |
---|---|
count | 993.00 |
mean | 6.04 |
std | 17.95 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 4.00 |
max | 330.00 |
# Rows of the mention summary with gender 'M': write the complete list to
# CSV, then preview the first 25 rows restricted to the display fields.
is_male = journalists_mention_summary_df.gender == 'M'
male_journalists_mention_summary_df = journalists_mention_summary_df[is_male]
male_journalists_mention_summary_df.to_csv('output/male_journalists_mentioned_by_journalists.csv')
male_journalists_mention_summary_df.loc[:, journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
28496589 | TenaciousTopper | Shutt, Charles | WUSA–TV | M | 15868 | 239.00 | 13.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 235.00 | 10.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 212.00 | 46.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 127.00 | 51.00 |
169586280 | WaPoSean | Sullivan, Sean | Washington Post | M | 22860 | 117.00 | 20.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 116.00 | 47.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 107.00 | 43.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 106.00 | 42.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 105.00 | 27.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 104.00 | 40.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 98.00 | 16.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 95.00 | 43.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 84.00 | 41.00 |
48802204 | HardballChris | Matthews, Chris | NBC News | M | 718330 | 80.00 | 9.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 78.00 | 37.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 78.00 | 27.00 |
24439201 | jameshohmann | Hohmann, James P. | Washington Post | M | 38708 | 78.00 | 27.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 75.00 | 37.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 73.00 | 15.00 |
112526560 | kenvogel | Vogel, Kenneth P. | Politico | M | 53894 | 67.00 | 45.00 |
18646108 | BretBaier | Baier, Bret | Fox News | M | 1095184 | 66.00 | 18.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 61.00 | 38.00 |
16067683 | pauldemko | Demko, Paul Jeffrey | Politico | M | 8170 | 60.00 | 13.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 57.00 | 26.00 |
71294756 | wolfblitzer | Blitzer, Wolf | CNN | M | 1281914 | 56.00 | 30.00 |
# Descriptive statistics (count, mean, std, quartiles, max) of mention_count for the male-journalist summary.
male_journalists_mention_summary_df[['mention_count']].describe()
mention_count | |
---|---|
count | 1,299.00 |
mean | 6.39 |
std | 17.31 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 5.00 |
max | 239.00 |
# Build the mention summary from only the mention rows whose gender column is 'F'
# (going by the variable name, mentions made by female journalists), write the
# complete table to CSV, and display the first 25 rows.
journalists_mentioned_by_female_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'F'])
journalists_mentioned_by_female_summary_df.to_csv('output/journalists_mentioned_by_female_journalists.csv')
journalists_mentioned_by_female_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 164.00 | 20.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 116.00 | 13.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 79.00 | 10.00 |
169586280 | WaPoSean | Sullivan, Sean | Washington Post | M | 22860 | 71.00 | 11.00 |
48802204 | HardballChris | Matthews, Chris | NBC News | M | 718330 | 70.00 | 3.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 64.00 | 16.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 61.00 | 6.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 60.00 | 26.00 |
16067683 | pauldemko | Demko, Paul Jeffrey | Politico | M | 8170 | 57.00 | 10.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 53.00 | 2.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 52.00 | 8.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 49.00 | 11.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 47.00 | 10.00 |
247852986 | rachanadixit | Pradhan, Rachana D. | Politico | F | 6178 | 43.00 | 7.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 43.00 | 7.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 40.00 | 21.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 40.00 | 18.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 37.00 | 13.00 |
16149614 | jrovner | Rovner, Julie | Kaiser Health News | F | 21844 | 35.00 | 14.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 35.00 | 13.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 35.00 | 12.00 |
158072303 | ValerieInsinna | Insinna, Valerie | Defense News | F | 4572 | 35.00 | 2.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 33.00 | 18.00 |
342226913 | GregStohr | Stohr, Greg | Bloomberg News | M | 7245 | 32.00 | 2.00 |
297532865 | kwelkernbc | Welker, Kristen | NBC News | F | 99234 | 31.00 | 9.00 |
# Gender breakdown computed over the mention rows whose gender column is 'F'.
journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'F'])
count | percentage | |
---|---|---|
M | 3162 | 54.8% |
F | 2605 | 45.2% |
# Build the mention summary from only the mention rows whose gender column is 'M'
# (going by the variable name, mentions made by male journalists), write the
# complete table to CSV, and display the first 25 rows.
journalists_mentioned_by_male_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])
journalists_mentioned_by_male_summary_df.to_csv('output/journalists_mentioned_by_male_journalists.csv')
journalists_mentioned_by_male_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
325050734 | AllysonRaeWx | Banks, Allyson | WUSA–TV | F | 6918 | 324.00 | 4.00 |
28496589 | TenaciousTopper | Shutt, Charles | WUSA–TV | M | 15868 | 225.00 | 7.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 225.00 | 4.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 87.00 | 30.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 84.00 | 30.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 84.00 | 18.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 81.00 | 34.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 79.00 | 25.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 78.00 | 29.00 |
26632935 | HopeSeck | Hodge Seck, Hope | Military.com | F | 4584 | 76.00 | 1.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 71.00 | 22.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 69.00 | 31.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 67.00 | 27.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 66.00 | 29.00 |
16441088 | jestei | Steinhauer, Jennifer | New York Times | F | 13452 | 64.00 | 17.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 62.00 | 12.00 |
24439201 | jameshohmann | Hohmann, James P. | Washington Post | M | 38708 | 59.00 | 17.00 |
18646108 | BretBaier | Baier, Bret | Fox News | M | 1095184 | 59.00 | 14.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 55.00 | 29.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 55.00 | 9.00 |
381664207 | caitlinnowens | Owens, Caitlin N. | Axios | F | 5749 | 55.00 | 7.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 51.00 | 20.00 |
204599219 | pw_cunningham | Cunningham, Paige | Washington Examiner | F | 9255 | 51.00 | 9.00 |
112526560 | kenvogel | Vogel, Kenneth P. | Politico | M | 53894 | 50.00 | 32.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 50.00 | 3.00 |
# Gender breakdown computed over the mention rows whose gender column is 'M'.
journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])
count | percentage | |
---|---|---|
M | 5136 | 60.2% |
F | 3395 | 39.8% |
Including retweets and quotes
# Simplify the tweet on load
def retweet_transform(tweet):
    """Reduce a retweet or quote tweet to the fields used by the retweet analysis.

    Returns None when tweet_type() classifies the tweet as anything other
    than 'retweet' or 'quote'. Otherwise returns a dict holding the tweet's
    id, author and creation time, plus the id and screen name of the user
    whose tweet was retweeted or quoted.
    """
    if tweet_type(tweet) not in ('retweet', 'quote'):
        return None
    # The retweeted status takes precedence; the quoted status is the fallback.
    source_tweet = tweet.get('retweeted_status') or tweet.get('quoted_status')
    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'retweet_user_id': source_tweet['user']['id_str'],
        'retweet_screen_name': source_tweet['user']['screen_name'],
        'tweet_created_at': date_parse(tweet['created_at'])
    }
# Load every tweet through retweet_transform into a DataFrame with the listed columns,
# passing tweet_id as the dedupe column, then show the non-null count per column.
base_retweet_df = load_tweet_df(retweet_transform, ['tweet_id', 'user_id', 'screen_name', 'retweet_user_id',
'retweet_screen_name', 'tweet_created_at'],
dedupe_columns=['tweet_id'])
base_retweet_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 456956 user_id 456956 screen_name 456956 retweet_user_id 456956 retweet_screen_name 456956 tweet_created_at 456956 dtype: int64
# Preview the first rows of the loaded retweets/quotes.
base_retweet_df.head()
tweet_id | user_id | screen_name | retweet_user_id | retweet_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 872631046088601600 | 327862439 | jonathanvswan | 93069110 | maggieNYT | 2017-06-08 01:47:08+00:00 |
1 | 872610483647516673 | 327862439 | jonathanvswan | 160951141 | TomNamako | 2017-06-08 00:25:26+00:00 |
2 | 872609618626826240 | 327862439 | jonathanvswan | 18678924 | jmartNYT | 2017-06-08 00:22:00+00:00 |
3 | 872605974699311104 | 327862439 | jonathanvswan | 93069110 | maggieNYT | 2017-06-08 00:07:31+00:00 |
4 | 872603191518646276 | 327862439 | jonathanvswan | 94784682 | JonathanTurley | 2017-06-07 23:56:27+00:00 |
# Attach the gender of the retweeting user by looking up user_id in user_summary_df,
# then show the non-null count per column.
retweet_df = base_retweet_df.join(user_summary_df['gender'], on='user_id')
retweet_df.count()
tweet_id 456956 user_id 456956 screen_name 456956 retweet_user_id 456956 retweet_screen_name 456956 tweet_created_at 456956 gender 456956 dtype: int64
# Number of distinct users that were retweeted or quoted.
retweet_df['retweet_user_id'].unique().size
49154
# The inner join on retweet_user_id keeps only retweets whose retweeted user is present in
# user_summary_df; that user's gender arrives as gender_retweet and is renamed to retweet_gender.
journalists_retweet_df = retweet_df.join(user_summary_df['gender'], how='inner', on='retweet_user_id', rsuffix='_retweet')
journalists_retweet_df.rename(columns = {'gender_retweet': 'retweet_gender'}, inplace=True)
journalists_retweet_df.count()
tweet_id 117048 user_id 117048 screen_name 117048 retweet_user_id 117048 retweet_screen_name 117048 tweet_created_at 117048 gender 117048 retweet_gender 117048 dtype: int64
# Preview the first rows of the journalist-to-journalist retweets.
journalists_retweet_df.head()
tweet_id | user_id | screen_name | retweet_user_id | retweet_screen_name | tweet_created_at | gender | retweet_gender | |
---|---|---|---|---|---|---|---|---|
2 | 872609618626826240 | 327862439 | jonathanvswan | 18678924 | jmartNYT | 2017-06-08 00:22:00+00:00 | M | M |
435 | 871437820044464128 | 242169927 | colinwilhelm | 18678924 | jmartNYT | 2017-06-04 18:45:41+00:00 | M | M |
1406 | 872620054889857024 | 163589845 | PoliticoKevin | 18678924 | jmartNYT | 2017-06-08 01:03:28+00:00 | M | M |
1424 | 872240756597174272 | 163589845 | PoliticoKevin | 18678924 | jmartNYT | 2017-06-06 23:56:16+00:00 | M | M |
1455 | 870749993279385601 | 163589845 | PoliticoKevin | 18678924 | jmartNYT | 2017-06-02 21:12:30+00:00 | M | M |
# Gender of beltway journalists retweeted by beltway journalists
def journalist_retweet_gender_summary(retweet_df):
    """Tabulate retweets by the gender of the retweeted journalist.

    Args:
        retweet_df: DataFrame with a retweet_gender column, one row per retweet.

    Returns:
        DataFrame indexed by the retweet_gender values, with a 'count' column
        and a 'percentage' column holding each value's share of the counted
        rows as a string rounded to one decimal place (e.g. '68.9%').
    """
    # Count once and derive the shares from that same series instead of
    # running value_counts() a second time with normalize=True.
    gender_counts = retweet_df.retweet_gender.value_counts()
    gender_share = gender_counts.div(gender_counts.sum())
    return pd.DataFrame({'count': gender_counts,
                         'percentage': gender_share.mul(100).round(1).astype(str) + '%'})
def journalist_retweet_summary(retweet_df):
    """Summarize how often each user in user_summary_df was retweeted.

    Joins two per-user counts onto user_summary_df (by its index):
    retweet_count, the number of rows in retweet_df per retweet_user_id, and
    retweeting_count, the number of distinct user_id values per
    retweet_user_id. Every missing value in the joined frame is replaced
    with 0, and rows are sorted by retweet_count, retweeting_count and
    followers_count, all descending.
    """
    # Retweets received by each retweeted user.
    received_df = pd.DataFrame(retweet_df['retweet_user_id'].value_counts().rename('retweet_count'))

    # Distinct retweeters per retweeted user: collapse repeated
    # (retweeted user, retweeting user) pairs before counting.
    unique_pairs_df = retweet_df[['retweet_user_id', 'user_id']].drop_duplicates()
    retweeters_df = pd.DataFrame(unique_pairs_df.groupby('retweet_user_id').size(),
                                 columns=['retweeting_count'])
    retweeters_df.index.name = 'user_id'

    # Join both counts onto the user summary; users with no retweets get 0.
    summary_df = user_summary_df.join([received_df, retweeters_df]).fillna(0)
    return summary_df.sort_values(['retweet_count', 'retweeting_count', 'followers_count'],
                                  ascending=False)
# Gender of top journalists retweeted by beltway journalists
def top_journalist_retweet_gender_summary(retweet_summary_df, retweeting_count_threshold=0, head=100):
    """Tabulate the gender of the leading rows of a retweet summary.

    Args:
        retweet_summary_df: DataFrame with retweeting_count and gender columns.
            Rows are taken in their existing order, so the caller decides
            what "top" means by how the frame is sorted.
        retweeting_count_threshold: only rows with retweeting_count strictly
            greater than this value are considered.
        head: maximum number of qualifying rows to keep.

    Returns:
        DataFrame indexed by the gender values, with a 'count' column and a
        'percentage' column holding each value's share of the kept rows as a
        string rounded to one decimal place (e.g. '66.7%').
    """
    qualifies = retweet_summary_df.retweeting_count > retweeting_count_threshold
    top_retweet_summary_df = retweet_summary_df[qualifies].head(head)
    # Count once and derive the shares from that same series instead of
    # running value_counts() a second time with normalize=True.
    gender_counts = top_retweet_summary_df.gender.value_counts()
    gender_share = gender_counts.div(gender_counts.sum())
    return pd.DataFrame({'count': gender_counts,
                         'percentage': gender_share.mul(100).round(1).astype(str) + '%'})
# Fields for displaying journalist retweet summaries
journalist_retweet_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'retweet_count', 'retweeting_count']
Note that for each of these, the complete list is being written to CSV in the output directory.
That is, by gender of retweeter.
# Sum the retweet and quote columns of user_summary_df per gender, add their total,
# and express each gender's total as a percentage string of the overall total.
retweets_by_gender_df = user_summary_df[['gender', 'retweet', 'quote']].groupby('gender').sum()
retweets_by_gender_df['total'] = retweets_by_gender_df.retweet + retweets_by_gender_df.quote
retweets_by_gender_df['percentage'] = retweets_by_gender_df.total.div(retweets_by_gender_df.total.sum()).mul(100).round(1).astype(str) + '%'
retweets_by_gender_df
retweet | quote | total | percentage | |
---|---|---|---|---|
gender | ||||
F | 134,606.00 | 38,998.00 | 173,604.00 | 38.0% |
M | 210,660.00 | 72,692.00 | 283,352.00 | 62.0% |
# Select the per-user columns of interest, add retweet_count as retweet + quote,
# and display the 25 users with the highest retweet_count.
retweet_user_summary_df = user_summary_df.loc[:,('screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'retweet', 'quote', 'tweets_in_dataset')]
retweet_user_summary_df['retweet_count'] = retweet_user_summary_df.retweet + retweet_user_summary_df.quote
retweet_user_summary_df.sort_values(['retweet_count'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | tweet_count | retweet | quote | tweets_in_dataset | retweet_count | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
2453025128 | gloriaminott | Minott, Gloria | WPFW–FM | F | 586 | 61473 | 21,524.00 | 0.00 | 21,547.00 | 21,524.00 |
304988603 | NeilWMcCabe | McCabe, Neil | Breitbart News | M | 18903 | 64673 | 7,528.00 | 625.00 | 9,370.00 | 8,153.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 100803 | 4,449.00 | 1,834.00 | 8,196.00 | 6,283.00 |
191964162 | SamLitzinger | Litzinger, Sam | CBS News | M | 2329 | 95236 | 6,017.00 | 225.00 | 7,537.00 | 6,242.00 |
21612122 | HotlineJosh | Kraushaar, Josh P. | National Journal | M | 50438 | 156610 | 4,881.00 | 893.00 | 6,703.00 | 5,774.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 78015 | 4,570.00 | 822.00 | 6,377.00 | 5,392.00 |
16031927 | greta | Van Susteren, Greta | MSNBC | F | 1186850 | 116645 | 794.00 | 3,069.00 | 4,792.00 | 3,863.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 79125 | 3,332.00 | 449.00 | 4,537.00 | 3,781.00 |
47408060 | JonathanLanday | Landay, Jonathan | McClatchy Newspapers | M | 11213 | 81042 | 3,687.00 | 80.00 | 4,285.00 | 3,767.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 169908 | 2,703.00 | 859.00 | 4,564.00 | 3,562.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 99050 | 2,694.00 | 684.00 | 4,560.00 | 3,378.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 104613 | 1,377.00 | 1,955.00 | 4,907.00 | 3,332.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 49967 | 1,740.00 | 1,327.00 | 4,414.00 | 3,067.00 |
456994513 | maria_e_recio | Recio, Maria | Austin American-Statesman | F | 1072 | 40822 | 2,613.00 | 336.00 | 3,370.00 | 2,949.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 41620 | 2,112.00 | 828.00 | 5,567.00 | 2,940.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 92203 | 2,231.00 | 521.00 | 5,187.00 | 2,752.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 148143 | 2,435.00 | 287.00 | 5,078.00 | 2,722.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 27573 | 2,505.00 | 184.00 | 2,871.00 | 2,689.00 |
19545932 | kampeas | Kampeas, Ron | Jewish Telegraphic Agency | M | 6977 | 53053 | 1,988.00 | 444.00 | 3,249.00 | 2,432.00 |
42352386 | rschles | Schlesinger, Robert | U.S. News & World Report | M | 4553 | 35375 | 1,644.00 | 617.00 | 2,459.00 | 2,261.00 |
25702314 | EricMGarcia | Garcia, Eric M. | CQ Roll Call | M | 3094 | 44783 | 528.00 | 1,723.00 | 3,584.00 | 2,251.00 |
18646108 | BretBaier | Baier, Bret | Fox News | M | 1095184 | 52271 | 1,623.00 | 615.00 | 2,379.00 | 2,238.00 |
15486163 | SimonMarksFSN | Marks, Simon | Feature Story News | M | 7767 | 41541 | 1,296.00 | 934.00 | 3,432.00 | 2,230.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 106970 | 1,665.00 | 467.00 | 2,810.00 | 2,132.00 |
15730608 | edroso | Edroso, Roy | UCG | M | 4696 | 38064 | 1,714.00 | 379.00 | 2,883.00 | 2,093.00 |
This is based on screen name, which could have changed during the collection period. However, for the users that would be at the top of this list, this seems unlikely.
# Retweet count: number of rows in retweet_df per retweeted screen name
retweet_count_screen_name_df = pd.DataFrame(retweet_df.retweet_screen_name.value_counts().rename('retweet_count'))
# Count of retweeting users: distinct user_id values per retweeted screen name
retweet_user_id_per_user_screen_name_df = retweet_df[['retweet_screen_name', 'user_id']].drop_duplicates()
retweeting_count_screen_name_df = pd.DataFrame(retweet_user_id_per_user_screen_name_df.groupby('retweet_screen_name').size(), columns=['retweeting_count'])
retweeting_count_screen_name_df.index.name = 'screen_name'
# Combine both counts by screen name, write the complete table to CSV, and display the first 25 rows
all_retweeted_df = retweet_count_screen_name_df.join(retweeting_count_screen_name_df)
all_retweeted_df.to_csv('output/all_retweeted_by_journalists.csv')
all_retweeted_df.head(25)
retweet_count | retweeting_count | |
---|---|---|
realDonaldTrump | 6650 | 807 |
thehill | 5424 | 457 |
BraddJaffy | 3564 | 554 |
maggieNYT | 3024 | 530 |
business | 3000 | 229 |
washingtonpost | 2638 | 498 |
AP | 2480 | 581 |
politico | 2335 | 334 |
nytimes | 2268 | 485 |
WSJ | 1949 | 213 |
burgessev | 1836 | 289 |
kylegriffin1 | 1803 | 429 |
ZekeJMiller | 1723 | 387 |
CNN | 1602 | 366 |
GlennThrush | 1577 | 451 |
Reuters | 1487 | 265 |
jaketapper | 1459 | 397 |
TheEconomist | 1458 | 86 |
StevenTDennis | 1403 | 280 |
FoxNews | 1400 | 258 |
seungminkim | 1393 | 327 |
mkraju | 1359 | 341 |
PhilipRucker | 1349 | 365 |
markknoller | 1343 | 341 |
MEPFuller | 1324 | 286 |
# Build the retweet summary from all journalist-to-journalist retweets, write the
# complete table to CSV, and display the first 25 rows of the summary display fields.
journalists_retweet_summary_df = journalist_retweet_summary(journalists_retweet_df)
journalists_retweet_summary_df.to_csv('output/journalists_retweeted_by_journalists.csv')
journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 1,836.00 | 289.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 1,723.00 | 387.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,577.00 | 451.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,459.00 | 397.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 1,403.00 | 280.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 1,393.00 | 327.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 1,359.00 | 341.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,343.00 | 341.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 1,324.00 | 286.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,221.00 | 306.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 1,207.00 | 334.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 1,186.00 | 296.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 1,177.00 | 297.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 1,120.00 | 314.00 |
16006592 | BenjySarlin | Sarlin, Benjamin | NBC News | M | 78075 | 1,039.00 | 215.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 1,011.00 | 277.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 943.00 | 281.00 |
33653195 | ericawerner | Werner, Erica | Associated Press | F | 14049 | 939.00 | 281.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 916.00 | 247.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 909.00 | 388.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 849.00 | 306.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 829.00 | 315.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 770.00 | 193.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 708.00 | 13.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 705.00 | 155.00 |
# Count and percentage of journalist-to-journalist retweets by the gender of the retweeted user.
journalist_retweet_gender_summary(journalists_retweet_df)
count | percentage | |
---|---|---|
M | 80634 | 68.9% |
F | 36414 | 31.1% |
# Descriptive statistics (count, mean, std, quartiles, max) of retweet_count across the whole summary.
journalists_retweet_summary_df[['retweet_count']].describe()
retweet_count | |
---|---|
count | 2,292.00 |
mean | 51.07 |
std | 149.06 |
min | 0.00 |
25% | 0.00 |
50% | 6.00 |
75% | 33.00 |
max | 1,836.00 |
# Keep only the summary rows whose gender is 'F', write the complete table to CSV,
# and display the first 25 rows of the summary display fields.
female_journalists_retweet_summary_df = journalists_retweet_summary_df[journalists_retweet_summary_df.gender == 'F']
female_journalists_retweet_summary_df.to_csv('output/female_journalists_retweeted_by_journalists.csv')
female_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 1,393.00 | 327.00 |
33653195 | ericawerner | Werner, Erica | Associated Press | F | 14049 | 939.00 | 281.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 909.00 | 388.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 849.00 | 306.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 708.00 | 13.00 |
167024520 | rachaelmbade | Bade, Rachel M. | Politico | F | 30164 | 614.00 | 161.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 539.00 | 268.00 |
139738464 | mj_lee | Lee, MJ | CNN | F | 31940 | 518.00 | 189.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 474.00 | 136.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 444.00 | 118.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 443.00 | 189.00 |
705706292 | rebeccaballhaus | Ballhaus, Rebecca | Wall Street Journal / Dow Jones | F | 24638 | 409.00 | 154.00 |
19734832 | sarahkliff | Kliff, Sarah L. | Vox Media | F | 100090 | 392.00 | 136.00 |
163995093 | AlexNBCNews | Moe, Alexandra | NBC News | F | 21689 | 388.00 | 134.00 |
237477771 | juliehdavis | Davis, Julie | New York Times | F | 49821 | 375.00 | 194.00 |
16149614 | jrovner | Rovner, Julie | Kaiser Health News | F | 21844 | 351.00 | 137.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 345.00 | 125.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 328.00 | 132.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 312.00 | 70.00 |
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 308.00 | 38.00 |
188857501 | alexis_levinson | Levinson, Alexis R. | BuzzFeed | F | 25375 | 288.00 | 111.00 |
56552341 | LACaldwellDC | Caldwell, Leigh Ann | NBC News | F | 8464 | 282.00 | 98.00 |
151444950 | DaviSusan | Davis, Susan | National Public Radio | F | 27297 | 270.00 | 150.00 |
360080772 | FoxReports | Fox, Lauren | CNN | F | 7282 | 269.00 | 116.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 269.00 | 115.00 |
# Descriptive statistics (count, mean, std, quartiles, max) of retweet_count for the rows with gender 'F'.
female_journalists_retweet_summary_df[['retweet_count']].describe()
retweet_count | |
---|---|
count | 993.00 |
mean | 36.67 |
std | 97.34 |
min | 0.00 |
25% | 0.00 |
50% | 5.00 |
75% | 25.00 |
max | 1,393.00 |
# Keep only the summary rows whose gender is 'M', write the complete table to CSV,
# and display the first 25 rows of the summary display fields.
male_journalists_retweet_summary_df = journalists_retweet_summary_df[journalists_retweet_summary_df.gender == 'M']
male_journalists_retweet_summary_df.to_csv('output/male_journalists_retweeted_by_journalists.csv')
male_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 1,836.00 | 289.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 1,723.00 | 387.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,577.00 | 451.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,459.00 | 397.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 1,403.00 | 280.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 1,359.00 | 341.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,343.00 | 341.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 1,324.00 | 286.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,221.00 | 306.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 1,207.00 | 334.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 1,186.00 | 296.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 1,177.00 | 297.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 1,120.00 | 314.00 |
16006592 | BenjySarlin | Sarlin, Benjamin | NBC News | M | 78075 | 1,039.00 | 215.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 1,011.00 | 277.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 943.00 | 281.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 916.00 | 247.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 829.00 | 315.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 770.00 | 193.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 705.00 | 155.00 |
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 704.00 | 225.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 699.00 | 223.00 |
22129280 | jimsciutto | Sciutto, James | CNN | M | 172012 | 688.00 | 242.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 654.00 | 284.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 642.00 | 229.00 |
# Descriptive statistics (count, mean, std, quartiles, max) of retweet_count for the rows with gender 'M'.
male_journalists_retweet_summary_df[['retweet_count']].describe()
retweet_count | |
---|---|
count | 1,299.00 |
mean | 62.07 |
std | 178.04 |
min | 0.00 |
25% | 1.00 |
50% | 8.00 |
75% | 39.50 |
max | 1,836.00 |
# Build the retweet summary from only the retweets whose retweeting user has gender 'F',
# write the complete table to CSV, and display the first 25 rows.
journalists_retweeted_by_female_summary_df = journalist_retweet_summary(journalists_retweet_df[journalists_retweet_df.gender == 'F'])
journalists_retweeted_by_female_summary_df.to_csv('output/journalists_retweeted_by_female_journalists.csv')
journalists_retweeted_by_female_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 748.00 | 122.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 704.00 | 9.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 572.00 | 142.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 549.00 | 140.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 516.00 | 149.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 503.00 | 97.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 470.00 | 140.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 463.00 | 165.00 |
33653195 | ericawerner | Werner, Erica | Associated Press | F | 14049 | 452.00 | 119.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 447.00 | 116.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 403.00 | 132.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 388.00 | 158.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 372.00 | 129.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 367.00 | 67.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 365.00 | 122.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 344.00 | 164.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 338.00 | 103.00 |
167024520 | rachaelmbade | Bade, Rachel M. | Politico | F | 30164 | 303.00 | 59.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 302.00 | 106.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 287.00 | 61.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 279.00 | 111.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 265.00 | 119.00 |
139738464 | mj_lee | Lee, MJ | CNN | F | 31940 | 259.00 | 79.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 256.00 | 82.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 253.00 | 115.00 |
# Gender of the retweeted user, counted over retweets made by users with gender 'F'.
journalist_retweet_gender_summary(journalists_retweet_df[journalists_retweet_df.gender == 'F'])
count | percentage | |
---|---|---|
M | 25410 | 59.6% |
F | 17228 | 40.4% |
# For each retweeting user with gender 'F', count retweets per retweeted gender (one column
# per gender after unstack), then describe the distribution of those per-user counts.
female_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'F']
female_journalists_retweet_df.groupby(['user_id', 'retweet_gender']).size().unstack().describe()
retweet_gender | F | M |
---|---|---|
count | 736.00 | 771.00 |
mean | 23.41 | 32.96 |
std | 51.31 | 83.17 |
min | 1.00 | 1.00 |
25% | 3.00 | 4.00 |
50% | 8.00 | 10.00 |
75% | 23.00 | 32.00 |
max | 857.00 | 1,779.00 |
# Build the retweet summary from only the retweets whose retweeting user has gender 'M',
# write the complete table to CSV, and display the first 25 rows.
journalists_retweeted_by_male_summary_df = journalist_retweet_summary(journalists_retweet_df[journalists_retweet_df.gender == 'M'])
journalists_retweeted_by_male_summary_df.to_csv('output/journalists_retweeted_by_male_journalists.csv')
journalists_retweeted_by_male_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 1,207.00 | 238.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,114.00 | 286.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 1,088.00 | 167.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,071.00 | 239.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 975.00 | 209.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 956.00 | 209.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 900.00 | 183.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 877.00 | 170.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 848.00 | 193.00 |
16006592 | BenjySarlin | Sarlin, Benjamin | NBC News | M | 78075 | 828.00 | 141.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 821.00 | 185.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 812.00 | 175.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 794.00 | 201.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 777.00 | 196.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 748.00 | 185.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 737.00 | 194.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 726.00 | 167.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 641.00 | 175.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 583.00 | 127.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 570.00 | 195.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 565.00 | 224.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 564.00 | 196.00 |
19580890 | LeeCamp | Camp, Lee | RTTV America | M | 67601 | 560.00 | 6.00 |
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 524.00 | 149.00 |
22129280 | jimsciutto | Sciutto, James | CNN | M | 172012 | 507.00 | 151.00 |
# Gender of the retweeted user, counted over retweets made by users with gender 'M'.
journalist_retweet_gender_summary(journalists_retweet_df[journalists_retweet_df.gender == 'M'])
count | percentage | |
---|---|---|
M | 55224 | 74.2% |
F | 19186 | 25.8% |
# For each retweeting user with gender 'M', count retweets per retweeted gender (one column
# per gender after unstack), then describe the distribution of those per-user counts.
male_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'M']
male_journalists_retweet_df.groupby(['user_id', 'retweet_gender']).size().unstack().describe()
retweet_gender | F | M |
---|---|---|
count | 886.00 | 1,002.00 |
mean | 21.65 | 55.11 |
std | 38.69 | 118.80 |
min | 1.00 | 1.00 |
25% | 3.00 | 4.00 |
50% | 8.00 | 15.00 |
75% | 23.00 | 52.00 |
max | 442.00 | 1,414.00 |
# Simplify the tweet on load
def reply_transform(tweet):
    """Reduce a reply tweet to the fields used by the reply analysis.

    Returns None when tweet_type() classifies the tweet as anything other
    than 'reply'. Otherwise returns a dict holding the tweet's id, author
    and creation time, plus the id and screen name of the user being
    replied to.
    """
    if tweet_type(tweet) != 'reply':
        return None
    reply_record = {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'reply_to_user_id': tweet['in_reply_to_user_id_str'],
        'reply_to_screen_name': tweet['in_reply_to_screen_name'],
        'tweet_created_at': date_parse(tweet['created_at'])
    }
    return reply_record
# Load the tweets through reply_transform (which returns None for non-replies),
# de-duplicating on tweet id.
base_reply_df = load_tweet_df(
    reply_transform,
    [
        'tweet_id',
        'user_id',
        'screen_name',
        'reply_to_user_id',
        'reply_to_screen_name',
        'tweet_created_at',
    ],
    dedupe_columns=['tweet_id'],
)
base_reply_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 126254 user_id 126254 screen_name 126254 reply_to_user_id 126254 reply_to_screen_name 126254 tweet_created_at 126254 dtype: int64
base_reply_df.head()
tweet_id | user_id | screen_name | reply_to_user_id | reply_to_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 872495244062978048 | 327862439 | jonathanvswan | 59331128 | PhilipRucker | 2017-06-07 16:47:31+00:00 |
1 | 872473152160399361 | 327862439 | jonathanvswan | 2856617865 | RPhuket | 2017-06-07 15:19:43+00:00 |
2 | 872266930341728256 | 327862439 | jonathanvswan | 1854392378 | hrm_1973 | 2017-06-07 01:40:16+00:00 |
3 | 872250430109175809 | 327862439 | jonathanvswan | 390985197 | MikeBastasch | 2017-06-07 00:34:42+00:00 |
4 | 872218322187767808 | 327862439 | jonathanvswan | 407013776 | burgessev | 2017-06-06 22:27:07+00:00 |
# Attach the gender of the replying user (left join keeps every reply).
reply_df = base_reply_df.join(
    user_summary_df['gender'],
    on='user_id',
    how='left',
)
reply_df.count()
tweet_id 126254 user_id 126254 screen_name 126254 reply_to_user_id 126254 reply_to_screen_name 126254 tweet_created_at 126254 gender 126254 dtype: int64
reply_df['reply_to_user_id'].unique().size
31034
# Keep only replies whose target is also in user_summary_df and record the
# target's gender as reply_to_gender.
journalists_reply_df = (
    reply_df
    .join(user_summary_df['gender'], on='reply_to_user_id', how='inner', rsuffix='_reply')
    .rename(columns={'gender_reply': 'reply_to_gender'})
)
journalists_reply_df.count()
tweet_id 43390 user_id 43390 screen_name 43390 reply_to_user_id 43390 reply_to_screen_name 43390 tweet_created_at 43390 gender 43390 reply_to_gender 43390 dtype: int64
journalists_reply_df.head()
tweet_id | user_id | screen_name | reply_to_user_id | reply_to_screen_name | tweet_created_at | gender | reply_to_gender | |
---|---|---|---|---|---|---|---|---|
4 | 872218322187767808 | 327862439 | jonathanvswan | 407013776 | burgessev | 2017-06-06 22:27:07+00:00 | M | M |
234 | 871795694020984833 | 195840597 | JNicholsonInDC | 407013776 | burgessev | 2017-06-05 18:27:45+00:00 | M | M |
572 | 870371176866041856 | 163589845 | PoliticoKevin | 407013776 | burgessev | 2017-06-01 20:07:13+00:00 | M | M |
728 | 870659438901940224 | 115564212 | IsaacDovere | 407013776 | burgessev | 2017-06-02 15:12:40+00:00 | M | M |
731 | 872473152143667201 | 167024520 | rachaelmbade | 407013776 | burgessev | 2017-06-07 15:19:43+00:00 | F | M |
# Gender of beltway journalists replied to by beltway journalists
def journalist_reply_gender_summary(reply_df):
    """Summarize the `reply_to_gender` column of *reply_df*.

    Returns a DataFrame indexed by gender value with two columns:
    'count' (number of rows per gender) and 'percentage' (share of all
    rows, rounded to one decimal and formatted like '76.5%').
    """
    replied_to_gender = reply_df.reply_to_gender
    counts = replied_to_gender.value_counts()
    shares = replied_to_gender.value_counts(normalize=True).mul(100).round(1)
    return pd.DataFrame({'count': counts, 'percentage': shares.astype(str) + '%'})
# Reply to beltway journalists by beltway journalists
def journalist_reply_summary(reply_df):
    """Build a per-journalist summary of replies received.

    Adds two columns to `user_summary_df`:
      * 'reply_to_count'  - number of rows in *reply_df* replying to the user
      * 'replying_count'  - number of distinct `user_id`s replying to the user

    Missing values are filled with 0 and the result is sorted descending by
    reply_to_count, replying_count, followers_count.
    """
    # How many replies each user received.
    reply_to_counts = reply_df['reply_to_user_id'].value_counts().to_frame('reply_to_count')

    # How many unique users replied to each user: de-duplicate the
    # (replied-to, replier) pairs first, then count pairs per replied-to user.
    distinct_pairs = reply_df[['reply_to_user_id', 'user_id']].drop_duplicates()
    replying_counts = (
        distinct_pairs
        .groupby('reply_to_user_id')
        .size()
        .to_frame('replying_count')
        .rename_axis('user_id')
    )

    # Join both counts onto the user summary; users never replied to get 0.
    summary_df = user_summary_df.join([reply_to_counts, replying_counts]).fillna(0)
    return summary_df.sort_values(
        ['reply_to_count', 'replying_count', 'followers_count'],
        ascending=False,
    )
# Gender of top journalists replied to by beltway journalists
def top_journalist_reply_gender_summary(reply_summary_df, replying_count_threshold=0, head=100):
    """Gender breakdown of the first *head* rows above a replying-count threshold.

    Rows of *reply_summary_df* whose `replying_count` is strictly greater than
    *replying_count_threshold* are kept, in their existing order, and the
    first *head* of them are summarized. Returns a DataFrame indexed by
    gender with 'count' and 'percentage' (e.g. '81.0%') columns.
    """
    above_threshold = reply_summary_df.replying_count > replying_count_threshold
    top_gender = reply_summary_df[above_threshold].head(head).gender
    counts = top_gender.value_counts()
    shares = top_gender.value_counts(normalize=True).mul(100).round(1)
    return pd.DataFrame({'count': counts, 'percentage': shares.astype(str) + '%'})
# Fields for displaying journalist reply summaries
journalist_reply_summary_fields = [
    'screen_name',
    'name',
    'organization',
    'gender',
    'followers_count',
    'reply_to_count',
    'replying_count',
]
Note that for each of these, the complete list is being written to CSV in the output directory.
# Total of the 'reply' column per gender, plus each gender's share of the total
replies_by_gender_df = user_summary_df.groupby('gender')[['reply']].sum()
replies_by_gender_df['percentage'] = (
    (replies_by_gender_df['reply'] / replies_by_gender_df['reply'].sum() * 100)
    .round(1)
    .astype(str)
    + '%'
)
replies_by_gender_df
reply | percentage | |
---|---|---|
gender | ||
F | 31,831.00 | 25.2% |
M | 94,423.00 | 74.8% |
user_summary_df[['screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'reply', 'tweets_in_dataset']].sort_values(['reply'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | tweet_count | reply | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
user_id | ||||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 208763 | 9,033.00 | 11,432.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 205504 | 3,917.00 | 6,244.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 42497 | 2,040.00 | 3,960.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 41620 | 1,949.00 | 5,567.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 142150 | 1,714.00 | 3,983.00 |
63717541 | phillyrich1 | Weinstein, Richard | C–SPAN | M | 3827 | 27341 | 1,532.00 | 2,261.00 |
203226736 | SharylAttkisson | Attkisson, Sharyl | Sinclair Broadcast Group | F | 132973 | 24539 | 1,458.00 | 2,154.00 |
16812908 | crousselle | Rousselle, Christine | Townhall | F | 5327 | 118713 | 1,089.00 | 2,351.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 148143 | 1,040.00 | 5,078.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 67526 | 1,026.00 | 3,066.00 |
27882000 | jamiedupree | Dupree, Jamie | Cox Broadcasting | M | 140848 | 46181 | 993.00 | 2,108.00 |
3372900155 | samtayrey | Reyes, Samantha | CNN | F | 10344 | 4783 | 933.00 | 1,349.00 |
132482136 | Yaro_RT | Yaroshevsky, Alexey | RTTV America | M | 12968 | 26795 | 910.00 | 1,199.00 |
46955476 | GrahamDavidA | Graham, David A. | The Atlantic | M | 22112 | 93391 | 908.00 | 1,566.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 92203 | 901.00 | 5,187.00 |
25702314 | EricMGarcia | Garcia, Eric M. | CQ Roll Call | M | 3094 | 44783 | 863.00 | 3,584.00 |
12245632 | jackshafer | Shafer, Jack | Politico | M | 73996 | 44726 | 861.00 | 2,016.00 |
273540698 | MKTWgoldstein | Goldstein, Steven | MarketWatch | M | 10185 | 41497 | 857.00 | 1,897.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 51628 | 853.00 | 2,022.00 |
6904552 | juliemason | Mason, Julie | Sirius XM Satellite Radio | F | 31276 | 29214 | 852.00 | 1,213.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 69807 | 848.00 | 2,496.00 |
15837659 | jbendery | Bendery, Jennifer | Huffington Post | M | 41000 | 65406 | 844.00 | 2,600.00 |
15146659 | JSwiftTWS | Swift, James A. | Weekly Standard | M | 5691 | 84245 | 830.00 | 2,612.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 17796 | 807.00 | 1,312.00 |
14517538 | derekwillis | Willis, Derek | ProPublica | M | 18049 | 79502 | 781.00 | 1,811.00 |
This is based on screen name, which could have changed during the collection period. However, for the users who would be at the top of this list, that seems unlikely.
# Number of replies received, keyed by the replied-to screen name
reply_to_count_screen_name_df = (
    reply_df['reply_to_screen_name']
    .value_counts()
    .to_frame('reply_to_count')
)
# Number of distinct replying users per replied-to screen name:
# de-duplicate (screen name, replier) pairs, then count pairs per screen name.
reply_to_user_id_per_user_screen_name_df = reply_df[['reply_to_screen_name', 'user_id']].drop_duplicates()
replying_count_screen_name_df = (
    reply_to_user_id_per_user_screen_name_df
    .groupby('reply_to_screen_name')
    .size()
    .to_frame('replying_count')
    .rename_axis('screen_name')
)
# Combine both counts, write the complete list to CSV and show the top 25.
all_replied_to_df = reply_to_count_screen_name_df.join(replying_count_screen_name_df)
all_replied_to_df.to_csv('output/all_replied_to_by_journalists.csv')
all_replied_to_df.head(25)
reply_to_count | replying_count | |
---|---|---|
ericgeller | 1980 | 75 |
chrisgeidner | 1901 | 37 |
dylanlscott | 1091 | 65 |
JaredRizzi | 750 | 46 |
StevenTDennis | 745 | 93 |
AlexParkerDC | 720 | 23 |
sahilkapur | 662 | 35 |
jseldin | 653 | 2 |
MEPFuller | 522 | 92 |
amaxsmith | 498 | 6 |
ddale8 | 495 | 20 |
CraigCaplan | 388 | 8 |
ChuckWendig | 372 | 1 |
pbump | 355 | 43 |
kelmej | 340 | 29 |
benjamin_oc | 322 | 11 |
KimberlyRobinsn | 321 | 7 |
darth | 315 | 32 |
ZoeTillman | 311 | 8 |
RichardRubinDC | 305 | 41 |
sdonnan | 304 | 7 |
AaronMehta | 304 | 35 |
MikeSacksEsq | 299 | 18 |
heathdwilliams | 298 | 1 |
ryanbeckwith | 297 | 49 |
journalists_reply_summary_df = journalist_reply_summary(journalists_reply_df)
journalists_reply_summary_df.to_csv('output/journalists_replied_to_by_journalists.csv')
journalists_reply_summary_df[journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 1,980.00 | 75.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 1,901.00 | 37.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 1,091.00 | 65.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 750.00 | 46.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 745.00 | 93.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 720.00 | 23.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 662.00 | 35.00 |
583821006 | jseldin | Seldin, Jeff | Voice of America | M | 5365 | 653.00 | 2.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 522.00 | 92.00 |
44951698 | amaxsmith | Smith, Max | WTOP Radio | M | 4726 | 498.00 | 6.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 495.00 | 20.00 |
317980134 | CraigCaplan | Caplan, Craig | C–SPAN | M | 6143 | 388.00 | 8.00 |
16061946 | kelmej | Mejdrich, Kellie | CQ Roll Call | F | 4146 | 340.00 | 29.00 |
15365623 | benjamin_oc | O’Connell, Benjamin | C–SPAN | M | 1455 | 322.00 | 11.00 |
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 321.00 | 7.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 311.00 | 8.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 305.00 | 41.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 304.00 | 35.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 304.00 | 7.00 |
90478926 | MikeSacksEsq | Sacks, Mike | Scripps Howard News Service | M | 9289 | 299.00 | 18.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 297.00 | 49.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 283.00 | 72.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 269.00 | 45.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 269.00 | 34.00 |
21212087 | Olivianuzzi | Nuzzi, Olivia | New York | F | 136276 | 243.00 | 25.00 |
journalist_reply_gender_summary(journalists_reply_df)
count | percentage | |
---|---|---|
M | 33178 | 76.5% |
F | 10212 | 23.5% |
journalists_reply_summary_df[['reply_to_count', 'replying_count']].describe()
reply_to_count | replying_count | |
---|---|---|
count | 2,292.00 | 2,292.00 |
mean | 18.93 | 3.81 |
std | 81.76 | 8.41 |
min | 0.00 | 0.00 |
25% | 0.00 | 0.00 |
50% | 1.00 | 1.00 |
75% | 8.00 | 4.00 |
max | 1,980.00 | 93.00 |
top_journalist_reply_gender_summary(journalists_reply_summary_df, replying_count_threshold=0)
count | percentage | |
---|---|---|
M | 81 | 81.0% |
F | 19 | 19.0% |
female_journalists_reply_summary_df = journalists_reply_summary_df[journalists_reply_summary_df.gender == 'F']
female_journalists_reply_summary_df.to_csv('output/female_journalists_replied_to_by_journalists.csv')
female_journalists_reply_summary_df[journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
16061946 | kelmej | Mejdrich, Kellie | CQ Roll Call | F | 4146 | 340.00 | 29.00 |
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 321.00 | 7.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 311.00 | 8.00 |
21212087 | Olivianuzzi | Nuzzi, Olivia | New York | F | 136276 | 243.00 | 25.00 |
83462293 | SarahMMimms | Mimms, Sarah | BuzzFeed | F | 6216 | 236.00 | 24.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 233.00 | 84.00 |
3372900155 | samtayrey | Reyes, Samantha | CNN | F | 10344 | 219.00 | 18.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 212.00 | 48.00 |
1132012321 | DaniellaMicaela | Diaz, Daniella | CNN | F | 14612 | 181.00 | 36.00 |
158072303 | ValerieInsinna | Insinna, Valerie | Defense News | F | 4572 | 175.00 | 20.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 174.00 | 21.00 |
96405362 | laurenonthehill | Camera, Lauren S. | U.S. News & World Report | F | 3396 | 162.00 | 6.00 |
16812908 | crousselle | Rousselle, Christine | Townhall | F | 5327 | 149.00 | 5.00 |
47758416 | marissaaevans | Evans, Marissa | Texas Tribune | F | 6850 | 137.00 | 1.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 134.00 | 23.00 |
16434028 | gabbilevy | Levy, Gabrielle F. | U.S. News & World Report | F | 2209 | 132.00 | 4.00 |
14870670 | KateNocera | Nocera, Kate | BuzzFeed | F | 27714 | 116.00 | 36.00 |
18501487 | leighmunsil | Munsil, Leigh | CNN | F | 11059 | 107.00 | 30.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 104.00 | 12.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 99.00 | 31.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 96.00 | 44.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 95.00 | 47.00 |
38855868 | brennawilliams | Williams, Brenna | CNN | F | 7299 | 93.00 | 22.00 |
273700859 | kpolantz | Polantz, Katelyn J. | National Law Journal | F | 2483 | 91.00 | 6.00 |
3273220608 | KatherineBScott | Scott, Katherine | Bloomberg Government | F | 1841 | 85.00 | 14.00 |
female_journalists_reply_summary_df[['reply_to_count', 'replying_count']].describe()
reply_to_count | replying_count | |
---|---|---|
count | 993.00 | 993.00 |
mean | 10.28 | 2.95 |
std | 31.00 | 6.33 |
min | 0.00 | 0.00 |
25% | 0.00 | 0.00 |
50% | 1.00 | 1.00 |
75% | 6.00 | 3.00 |
max | 340.00 | 84.00 |
male_journalists_reply_summary_df = journalists_reply_summary_df[journalists_reply_summary_df.gender == 'M']
male_journalists_reply_summary_df.to_csv('output/male_journalists_replied_to_by_journalists.csv')
male_journalists_reply_summary_df[journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 1,980.00 | 75.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 1,901.00 | 37.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 1,091.00 | 65.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 750.00 | 46.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 745.00 | 93.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 720.00 | 23.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 662.00 | 35.00 |
583821006 | jseldin | Seldin, Jeff | Voice of America | M | 5365 | 653.00 | 2.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 522.00 | 92.00 |
44951698 | amaxsmith | Smith, Max | WTOP Radio | M | 4726 | 498.00 | 6.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 495.00 | 20.00 |
317980134 | CraigCaplan | Caplan, Craig | C–SPAN | M | 6143 | 388.00 | 8.00 |
15365623 | benjamin_oc | O’Connell, Benjamin | C–SPAN | M | 1455 | 322.00 | 11.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 305.00 | 41.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 304.00 | 35.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 304.00 | 7.00 |
90478926 | MikeSacksEsq | Sacks, Mike | Scripps Howard News Service | M | 9289 | 299.00 | 18.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 297.00 | 49.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 283.00 | 72.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 269.00 | 45.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 269.00 | 34.00 |
190360266 | connorobrienNH | O’Brien, Connor | Politico | M | 6158 | 241.00 | 35.00 |
63717541 | phillyrich1 | Weinstein, Richard | C–SPAN | M | 3827 | 241.00 | 4.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 238.00 | 79.00 |
80111587 | JeffYoung | Young, Jeffrey | Huffington Post | M | 26497 | 238.00 | 31.00 |
male_journalists_reply_summary_df[['reply_to_count', 'replying_count']].describe()
reply_to_count | replying_count | |
---|---|---|
count | 1,299.00 | 1,299.00 |
mean | 25.54 | 4.46 |
std | 104.71 | 9.66 |
min | 0.00 | 0.00 |
25% | 0.00 | 0.00 |
50% | 1.00 | 1.00 |
75% | 11.00 | 4.00 |
max | 1,980.00 | 93.00 |
journalists_replied_to_by_female_summary_df = journalist_reply_summary(journalists_reply_df[journalists_reply_df.gender == 'F'])
journalists_replied_to_by_female_summary_df.to_csv('output/journalists_replied_to_by_female_journalists.csv')
journalists_replied_to_by_female_summary_df[journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 313.00 | 2.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 305.00 | 3.00 |
16061946 | kelmej | Mejdrich, Kellie | CQ Roll Call | F | 4146 | 295.00 | 15.00 |
83462293 | SarahMMimms | Mimms, Sarah | BuzzFeed | F | 6216 | 195.00 | 7.00 |
21212087 | Olivianuzzi | Nuzzi, Olivia | New York | F | 136276 | 190.00 | 9.00 |
3372900155 | samtayrey | Reyes, Samantha | CNN | F | 10344 | 179.00 | 7.00 |
96405362 | laurenonthehill | Camera, Lauren S. | U.S. News & World Report | F | 3396 | 159.00 | 5.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 148.00 | 18.00 |
1132012321 | DaniellaMicaela | Diaz, Daniella | CNN | F | 14612 | 144.00 | 22.00 |
16812908 | crousselle | Rousselle, Christine | Townhall | F | 5327 | 144.00 | 3.00 |
47758416 | marissaaevans | Evans, Marissa | Texas Tribune | F | 6850 | 137.00 | 1.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 133.00 | 5.00 |
16434028 | gabbilevy | Levy, Gabrielle F. | U.S. News & World Report | F | 2209 | 130.00 | 2.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 108.00 | 36.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 103.00 | 7.00 |
158072303 | ValerieInsinna | Insinna, Valerie | Defense News | F | 4572 | 97.00 | 8.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 97.00 | 5.00 |
18501487 | leighmunsil | Munsil, Leigh | CNN | F | 11059 | 88.00 | 13.00 |
273700859 | kpolantz | Polantz, Katelyn J. | National Law Journal | F | 2483 | 84.00 | 2.00 |
114670081 | rebleber | Leber, Rebecca J. | Mother Jones | F | 16467 | 79.00 | 3.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 78.00 | 30.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 78.00 | 20.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 73.00 | 13.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 72.00 | 10.00 |
48038024 | karentravers | Travers, Karen | ABC News | F | 17155 | 71.00 | 7.00 |
journalist_reply_gender_summary(journalists_reply_df[journalists_reply_df.gender == 'F'])
count | percentage | |
---|---|---|
F | 7412 | 72.1% |
M | 2864 | 27.9% |
top_journalist_reply_gender_summary(journalists_replied_to_by_female_summary_df, replying_count_threshold=0)
count | percentage | |
---|---|---|
F | 75 | 75.0% |
M | 25 | 25.0% |
journalists_replied_to_by_male_summary_df = journalist_reply_summary(journalists_reply_df[journalists_reply_df.gender == 'M'])
journalists_replied_to_by_male_summary_df.to_csv('output/journalists_replied_to_by_male_journalists.csv')
journalists_replied_to_by_male_summary_df[journalist_reply_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | reply_to_count | replying_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 1,926.00 | 58.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 1,864.00 | 28.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 1,013.00 | 45.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 726.00 | 35.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 709.00 | 20.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 686.00 | 61.00 |
583821006 | jseldin | Seldin, Jeff | Voice of America | M | 5365 | 653.00 | 2.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 646.00 | 24.00 |
44951698 | amaxsmith | Smith, Max | WTOP Radio | M | 4726 | 495.00 | 4.00 |
225265639 | ddale8 | Dale, Daniel | Toronto Star | M | 180671 | 490.00 | 16.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 456.00 | 64.00 |
317980134 | CraigCaplan | Caplan, Craig | C–SPAN | M | 6143 | 388.00 | 8.00 |
15365623 | benjamin_oc | O’Connell, Benjamin | C–SPAN | M | 1455 | 318.00 | 8.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 303.00 | 6.00 |
90478926 | MikeSacksEsq | Sacks, Mike | Scripps Howard News Service | M | 9289 | 294.00 | 13.00 |
227790723 | RichardRubinDC | Rubin, Richard | Bloomberg News | M | 13015 | 284.00 | 33.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 262.00 | 29.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 249.00 | 52.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 241.00 | 30.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 240.00 | 35.00 |
63717541 | phillyrich1 | Weinstein, Richard | C–SPAN | M | 3827 | 240.00 | 3.00 |
103016675 | AaronMehta | Mehta, Aaron | Sightline Media Group | M | 11124 | 232.00 | 25.00 |
26559241 | fordm | Ford, Matt S. | The Atlantic | M | 27571 | 232.00 | 15.00 |
437019753 | TimothyNoah1 | Noah, Timothy R. | Politico | M | 15090 | 231.00 | 12.00 |
23332846 | mattzap | Zapotosky, Matt | Washington Post | M | 56887 | 230.00 | 7.00 |
journalists_replied_to_by_male_summary_df.count()
screen_name 2292 name 2292 organization 2292 position 2292 gender 2292 followers_count 2292 following_count 2292 tweet_count 2292 user_created_at 2292 verified 2292 protected 2292 original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 reply_to_count 2292 replying_count 2292 dtype: int64
journalist_reply_gender_summary(journalists_reply_df[journalists_reply_df.gender == 'M'])
count | percentage | |
---|---|---|
M | 30314 | 91.5% |
F | 2800 | 8.5% |
top_journalist_reply_gender_summary(journalists_replied_to_by_male_summary_df, replying_count_threshold=0)
count | percentage | |
---|---|---|
M | 98 | 98.0% |
F | 2 | 2.0% |
Users that are followed by beltway journalists
# Load follower -> followed pairs from a headerless two-column CSV.
# Both ids are read as strings. The builtin `str` is used because the `np.str`
# alias was deprecated in NumPy 1.20 and removed in 1.24; it was only ever an
# alias for `str`, so the resulting dtypes are unchanged.
base_follower_to_followed_df = pd.read_csv(
    'source_data/follower_to_followed.csv',
    names=['follower_user_id', 'followed_user_id'],
    dtype={'follower_user_id': str, 'followed_user_id': str},
)
# Remove repeated (follower, followed) rows in place.
base_follower_to_followed_df.drop_duplicates(inplace=True)
base_follower_to_followed_df.count()
follower_user_id 3417018 followed_user_id 3417018 dtype: int64
base_follower_to_followed_df.head()
follower_user_id | followed_user_id | |
---|---|---|
0 | 91156486 | 3092427779 |
1 | 91156486 | 36953109 |
2 | 91156486 | 424274008 |
3 | 91156486 | 779044378929168384 |
4 | 91156486 | 339834914 |
user_info_df.head()
name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
20711445 | Glinski, Nina | NaN | Freelance Reporter | F | 963 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False |
258917371 | Enders, David | NaN | Journalist | M | 1444 | 484 | 6296 | Mon Feb 28 19:52:03 +0000 2011 | True | False |
297046834 | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 759 | 352 | 631 | Wed May 11 20:55:24 +0000 2011 | True | False |
455585786 | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2944 | 2691 | 6277 | Thu Jan 05 08:26:46 +0000 2012 | True | False |
42584840 | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2703 | 201 | 6366 | Tue May 26 07:41:38 +0000 2009 | False | False |
# Inner join on the follower's id: rows whose follower is not in
# user_summary_df are dropped (per the original note, these are journalists
# that have no tweets).
follower_to_followed_df = base_follower_to_followed_df.join(
    user_summary_df['gender'],
    how='inner',
    on='follower_user_id',
)
follower_to_followed_df.count()
follower_user_id 3311406 followed_user_id 3311406 gender 3311406 dtype: int64
follower_to_followed_df.head()
follower_user_id | followed_user_id | gender | |
---|---|---|---|
261 | 15219888 | 3291076716 | F |
262 | 15219888 | 119175339 | F |
263 | 15219888 | 418837047 | F |
264 | 15219888 | 259817885 | F |
265 | 15219888 | 287263845 | F |
# Lookup table from followed user id to screen name, read from a headerless
# two-column CSV. user_id is read as a string with the builtin `str`, because
# the `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24; it was
# only ever an alias for `str`, so the resulting dtype is unchanged.
followed_screen_name_lookup_df = pd.read_csv(
    'source_data/followed.csv',
    names=['screen_name', 'user_id'],
    dtype={'user_id': str},
).set_index(['user_id'])
followed_screen_name_lookup_df.head()
screen_name | |
---|---|
user_id | |
17665874 | onlinehigh |
2389275799 | HLSPOLICY |
314728983 | Veolia_NA |
239409802 | fishingbuk |
522799320 | GoldsmithBev |
# Keep only follow relationships whose followed account is also in
# user_summary_df, recording that account's gender as followed_gender.
follower_to_journalist_followed_df = (
    follower_to_followed_df
    .join(user_summary_df['gender'], on='followed_user_id', how='inner', rsuffix='_followed')
    .rename(columns={'gender_followed': 'followed_gender'})
)
follower_to_journalist_followed_df.count()
follower_user_id 280340 followed_user_id 280340 gender 280340 followed_gender 280340 dtype: int64
follower_to_journalist_followed_df.head()
follower_user_id | followed_user_id | gender | followed_gender | |
---|---|---|---|---|
287 | 15219888 | 46582653 | F | M |
21810 | 15780280 | 46582653 | M | M |
24153 | 14245722 | 46582653 | M | M |
40694 | 37865281 | 46582653 | F | M |
66585 | 165204211 | 46582653 | M | M |
# Gender of beltway journalists followed by beltway journalists
def journalist_followed_gender_summary(follower_to_followed_df):
    """Summarize the `followed_gender` column of *follower_to_followed_df*.

    Returns a DataFrame whose index (named 'index') holds the gender values,
    with columns:
      * 'count'        - number of rows with that followed gender
      * 'percentage'   - share of all rows, formatted like '74.2%'
      * 'avg_followed' - count divided by the number of rows in the global
                         `user_info_df` with that gender and a non-null name
    """
    followed_gender = follower_to_followed_df.followed_gender
    gender_summary_df = pd.DataFrame({
        'count': followed_gender.value_counts(),
        'percentage': followed_gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
    })
    # Iterate over the index directly rather than via reset_index(): the name of
    # the column reset_index() creates depends on the pandas version ('index'
    # before 2.0, the source column name from 2.0 on), so looking it up as
    # 'index' raises KeyError on newer releases.
    gender_summary_df['avg_followed'] = [
        count / user_info_df[user_info_df.gender == gender]['name'].count()
        for gender, count in gender_summary_df['count'].items()
    ]
    # Preserve the index name the original reset_index/set_index round trip produced.
    gender_summary_df.index.name = 'index'
    return gender_summary_df
def journalist_following_summary(follower_to_followed_df):
    """Build a per-journalist summary of how often each is followed.

    Adds 'journalist_follower_count' (number of rows in
    *follower_to_followed_df* whose `followed_user_id` is the user) to the
    global `user_summary_df`, fills missing values with 0 and sorts
    descending by journalist_follower_count, then followers_count.
    """
    # Rows per followed user id.
    follower_counts = (
        follower_to_followed_df['followed_user_id']
        .value_counts()
        .to_frame('journalist_follower_count')
    )
    # Join onto the user summary; users with no matching rows get 0.
    summary_df = user_summary_df.join(follower_counts).fillna(0)
    return summary_df.sort_values(
        ['journalist_follower_count', 'followers_count'],
        ascending=False,
    )
# Gender of top journalists followed by beltway journalists
def top_journalist_followed_gender_summary(followed_summary_df, head=100):
    """Gender breakdown of the first *head* rows of *followed_summary_df*.

    Rows are taken in their existing order. Returns a DataFrame indexed by
    gender with 'count' and 'percentage' (e.g. '75.0%') columns.
    """
    top_gender = followed_summary_df.head(head).gender
    counts = top_gender.value_counts()
    shares = top_gender.value_counts(normalize=True).mul(100).round(1)
    return pd.DataFrame({'count': counts, 'percentage': shares.astype(str) + '%'})
# Columns shown when displaying journalist following summaries
journalist_following_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'journalist_follower_count']
# Count the follow edges pointing at each followed account, attach the screen
# name from the lookup table, export everything and show the 25 most followed.
all_followed_df = (
    follower_to_followed_df['followed_user_id']
    .value_counts()
    .rename('following_count')
    .to_frame()
    .join(followed_screen_name_lookup_df)
)
all_followed_df.to_csv(path_or_buf='output/all_followed_by_journalists.csv')
all_followed_df.head(25)
following_count | screen_name | |
---|---|---|
813286 | 1671 | BarackObama |
51241574 | 1629 | AP |
25073877 | 1613 | realDonaldTrump |
807095 | 1581 | nytimes |
2467791 | 1532 | washingtonpost |
1339835893 | 1531 | HillaryClinton |
818927131883356161 | 1522 | PressSec |
822215673812119553 | 1507 | WhiteHouse |
822215679726100480 | 1488 | POTUS |
9300262 | 1457 | politico |
30313925 | 1402 | ObamaWhiteHouse |
14246001 | 1384 | mikeallen |
93069110 | 1368 | maggieNYT |
14529929 | 1337 | jaketapper |
428333 | 1289 | cnnbrk |
1536791610 | 1279 | POTUS44 |
3108351 | 1279 | WSJ |
50325797 | 1258 | chucktodd |
113420831 | 1258 | PressSec44 |
16017475 | 1234 | NateSilver538 |
18622869 | 1231 | ezraklein |
86129724 | 1173 | costareports |
1652541 | 1144 | Reuters |
1330457336 | 1128 | billclinton |
5392522 | 1124 | NPR |
# Rank every journalist by the number of follow edges pointing at them,
# export the full ranking, then show the display columns for the top 25.
follower_to_journalist_followed_summary_df = journalist_following_summary(
    follower_to_journalist_followed_df
)
follower_to_journalist_followed_summary_df.to_csv(
    path_or_buf='output/journalists_followed_by_journalists.csv'
)
follower_to_journalist_followed_summary_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,337.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 1,258.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,116.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,107.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,106.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 1,082.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 1,032.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 977.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 973.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 970.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 915.00 |
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 909.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 892.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 884.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 880.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 877.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 872.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 868.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 866.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 860.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 860.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 856.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 830.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 788.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 783.00 |
# Gender breakdown of the followed journalist across all journalist follow edges.
journalist_followed_gender_summary(follower_to_journalist_followed_df)
count | percentage | avg_followed | |
---|---|---|---|
index | |||
M | 174283 | 62.2% | 124.04 |
F | 106057 | 37.8% | 96.42 |
# Distribution of journalist_follower_count over every row of the summary.
follower_to_journalist_followed_summary_df[['journalist_follower_count']].describe()
journalist_follower_count | |
---|---|
count | 2,292.00 |
mean | 122.31 |
std | 161.53 |
min | 0.00 |
25% | 26.00 |
50% | 64.00 |
75% | 145.00 |
max | 1,337.00 |
# Gender split of the first 100 rows (the function's default) of the ranked summary.
top_journalist_followed_gender_summary(follower_to_journalist_followed_summary_df)
count | percentage | |
---|---|---|
M | 76 | 76.0% |
F | 24 | 24.0% |
# Restrict the ranked summary to female journalists, export it, and show the
# display columns for the top 25.
follower_to_female_journalist_followed_df = follower_to_journalist_followed_summary_df.loc[
    follower_to_journalist_followed_summary_df['gender'].eq('F')
]
follower_to_female_journalist_followed_df.to_csv(
    path_or_buf='output/female_journalists_followed_by_journalists.csv'
)
follower_to_female_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 909.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 884.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 877.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 860.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 777.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 738.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 679.00 |
21307076 | SusanPage | Page, Susan | USA Today | F | 48675 | 670.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 664.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 631.00 |
224320485 | KellyO | O’Donnell, Kelly | NBC News | F | 148476 | 630.00 |
20776497 | BFischerMartin | Fischer Martin, Betsy | Bloomberg News | F | 50890 | 609.00 |
77032777 | apalmerdc | Palmer, Anna A. | Politico | F | 30523 | 591.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 589.00 |
237477771 | juliehdavis | Davis, Julie | New York Times | F | 49821 | 570.00 |
58869089 | margarettalev | Talev, Margaret | Bloomberg News | F | 19588 | 569.00 |
14870670 | KateNocera | Nocera, Kate | BuzzFeed | F | 27714 | 567.00 |
46817943 | brikeilarcnn | Keilar, Brianna | CNN | F | 105276 | 557.00 |
22772264 | carolelee | Lee, Carol | Wall Street Journal / Dow Jones | F | 31840 | 552.00 |
15159913 | JFKucinich | Kucinich, Jacqueline | Daily Beast | F | 31210 | 549.00 |
297532865 | kwelkernbc | Welker, Kristen | NBC News | F | 99234 | 537.00 |
15727317 | aterkel | Terkel, Amanda | Huffington Post | F | 78736 | 527.00 |
17881467 | rebeccagberg | Berg, Rebecca | RealClearPolitics | F | 48798 | 516.00 |
151444950 | DaviSusan | Davis, Susan | National Public Radio | F | 27297 | 506.00 |
27055034 | SabrinaSiddiqui | Siddiqui, Sabrina | Guardian US | F | 53835 | 474.00 |
# Distribution of journalist_follower_count among female journalists.
follower_to_female_journalist_followed_df[['journalist_follower_count']].describe()
journalist_follower_count | |
---|---|
count | 993.00 |
mean | 106.80 |
std | 131.81 |
min | 0.00 |
25% | 24.00 |
50% | 59.00 |
75% | 131.00 |
max | 909.00 |
# Restrict the ranked summary to male journalists, export it, and show the
# display columns for the top 25.
follower_to_male_journalist_followed_df = follower_to_journalist_followed_summary_df.loc[
    follower_to_journalist_followed_summary_df['gender'].eq('M')
]
follower_to_male_journalist_followed_df.to_csv(
    path_or_buf='output/male_journalists_followed_by_journalists.csv'
)
follower_to_male_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,337.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 1,258.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,116.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,107.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,106.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 1,082.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 1,032.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 977.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 973.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 970.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 915.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 892.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 880.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 872.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 868.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 866.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 860.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 856.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 830.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 788.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 783.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 781.00 |
18172905 | rickklein | Klein, Richard | ABC News | M | 109170 | 737.00 |
21768766 | jonathanweisman | Weisman, Jonathan | New York Times | M | 57549 | 728.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 728.00 |
# Distribution of journalist_follower_count among male journalists.
follower_to_male_journalist_followed_df[['journalist_follower_count']].describe()
journalist_follower_count | |
---|---|
count | 1,299.00 |
mean | 134.17 |
std | 180.14 |
min | 0.00 |
25% | 28.00 |
50% | 67.00 |
75% | 156.00 |
max | 1,337.00 |
# Rank journalists using only the follow edges whose follower is female
# (the gender column of the edge frame), export, and show the top 25.
female_follower_to_journalist_followed_df = journalist_following_summary(
    follower_to_journalist_followed_df.loc[
        follower_to_journalist_followed_df['gender'].eq('F')
    ]
)
female_follower_to_journalist_followed_df.to_csv(
    path_or_buf='output/journalists_followed_by_female_journalists.csv'
)
female_follower_to_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 619.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 569.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 505.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 490.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 484.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 474.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 445.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 444.00 |
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 441.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 435.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 434.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 430.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 420.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 402.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 398.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 397.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 394.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 390.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 389.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 386.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 382.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 379.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 366.00 |
11771512 | OKnox | Knox, Olivier | Yahoo News | M | 44715 | 354.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 339.00 |
# Gender breakdown of the followed journalist, restricted to edges whose follower is female.
journalist_followed_gender_summary(follower_to_journalist_followed_df[follower_to_journalist_followed_df.gender == 'F'])
count | percentage | avg_followed | |
---|---|---|---|
index | |||
M | 73950 | 62.0% | 52.63 |
F | 45300 | 38.0% | 41.18 |
# Gender split of the first 100 rows of the ranking built from female followers only.
top_journalist_followed_gender_summary(female_follower_to_journalist_followed_df)
count | percentage | |
---|---|---|
M | 74 | 74.0% |
F | 26 | 26.0% |
# Rank journalists using only the follow edges whose follower is male
# (the gender column of the edge frame), export, and show the top 25.
male_follower_to_journalist_followed_df = journalist_following_summary(
    follower_to_journalist_followed_df.loc[
        follower_to_journalist_followed_df['gender'].eq('M')
    ]
)
male_follower_to_journalist_followed_df.to_csv(
    path_or_buf='output/journalists_followed_by_male_journalists.csv'
)
male_follower_to_journalist_followed_df.head(25)[journalist_following_summary_fields]
screen_name | name | organization | gender | followers_count | journalist_follower_count | |
---|---|---|---|---|---|---|
user_id | ||||||
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 718.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 689.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 626.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 622.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 608.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 602.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 587.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 543.00 |
85131054 | jeffzeleny | Zeleny, Jeff | CNN | M | 244114 | 535.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 529.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 510.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 495.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 494.00 |
130945778 | mollyesque | Ball, Molly | The Atlantic | F | 116857 | 491.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 482.00 |
46176168 | MajorCBS | Garrett, Major | CBS News | M | 178640 | 482.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 477.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 474.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 469.00 |
89820928 | mitchellreports | Mitchell, Andrea | NBC News | F | 1388543 | 468.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 464.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 458.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 454.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 446.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 441.00 |
# Gender breakdown of the followed journalist, restricted to edges whose follower is male.
journalist_followed_gender_summary(follower_to_journalist_followed_df[follower_to_journalist_followed_df.gender == 'M'])
count | percentage | avg_followed | |
---|---|---|---|
index | |||
M | 100333 | 62.3% | 71.41 |
F | 60757 | 37.7% | 55.23 |
# Gender split of the first 100 rows of the ranking built from male followers only.
top_journalist_followed_gender_summary(male_follower_to_journalist_followed_df)
count | percentage | |
---|---|---|
M | 77 | 77.0% |
F | 23 | 23.0% |
# Build one wide per-journalist table from the individual summaries.
# Each step is (summary frame, count columns to copy, suffix used when a column
# of the same name is already present in the merged table). The order matches
# the column order of the exported CSV.
_merge_steps = [
    # Mention by all / female / male
    (journalists_mention_summary_df, ['mention_count', 'mentioning_count'], ''),
    (journalists_mentioned_by_female_summary_df, ['mention_count', 'mentioning_count'], '_by_female'),
    (journalists_mentioned_by_male_summary_df, ['mention_count', 'mentioning_count'], '_by_male'),
    # Retweet by all / female / male
    (journalists_retweet_summary_df, ['retweet_count', 'retweeting_count'], ''),
    (journalists_retweeted_by_female_summary_df, ['retweet_count', 'retweeting_count'], '_by_female'),
    (journalists_retweeted_by_male_summary_df, ['retweet_count', 'retweeting_count'], '_by_male'),
    # Reply by all / female / male
    (journalists_reply_summary_df, ['reply_to_count', 'replying_count'], ''),
    (journalists_replied_to_by_female_summary_df, ['reply_to_count', 'replying_count'], '_by_female'),
    (journalists_replied_to_by_male_summary_df, ['reply_to_count', 'replying_count'], '_by_male'),
    # Follows by all / female / male
    (follower_to_journalist_followed_summary_df, ['journalist_follower_count'], ''),
    (female_follower_to_journalist_followed_df, ['journalist_follower_count'], '_by_female'),
    (male_follower_to_journalist_followed_df, ['journalist_follower_count'], '_by_male'),
]
user_merge_df = user_summary_df
for _summary_df, _count_columns, _suffix in _merge_steps:
    user_merge_df = user_merge_df.join(_summary_df[_count_columns], rsuffix=_suffix)
# Journalists missing from a summary get 0 for its counts
user_merge_df = user_merge_df.fillna(0)
# Write to CSV
user_merge_df.to_csv('output/journalists_summary.csv')
user_merge_df.count()
screen_name 2292 name 2292 organization 2292 position 2292 gender 2292 followers_count 2292 following_count 2292 tweet_count 2292 user_created_at 2292 verified 2292 protected 2292 original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 mention_count 2292 mentioning_count 2292 mention_count_by_female 2292 mentioning_count_by_female 2292 mention_count_by_male 2292 mentioning_count_by_male 2292 retweet_count 2292 retweeting_count 2292 retweet_count_by_female 2292 retweeting_count_by_female 2292 retweet_count_by_male 2292 retweeting_count_by_male 2292 reply_to_count 2292 replying_count 2292 reply_to_count_by_female 2292 replying_count_by_female 2292 reply_to_count_by_male 2292 replying_count_by_male 2292 journalist_follower_count 2292 journalist_follower_count_by_female 2292 journalist_follower_count_by_male 2292 dtype: int64
# Preview the first rows of the merged per-journalist summary.
user_merge_df.head()
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | ... | retweeting_count_by_male | reply_to_count | replying_count | reply_to_count_by_female | replying_count_by_female | reply_to_count_by_male | replying_count_by_male | journalist_follower_count | journalist_follower_count_by_female | journalist_follower_count_by_male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
23455653 | abettel | Bettelheim, Adriel | Politico | Health Care Editor | F | 2664 | 1055 | 15990 | Mon Mar 09 16:32:20 +0000 2009 | True | ... | 16.00 | 3.00 | 3.00 | 0.00 | 0.00 | 3.00 | 3.00 | 179.00 | 80.00 | 99.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | White House Reporter | F | 122382 | 2342 | 12433 | Tue Apr 21 14:28:57 +0000 2009 | True | ... | 172.00 | 26.00 | 18.00 | 4.00 | 4.00 | 22.00 | 14.00 | 777.00 | 339.00 | 438.00 |
18580432 | b_fung | Fung, Brian | Washington Post | Tech Reporter | M | 16558 | 2062 | 44799 | Sat Jan 03 15:15:57 +0000 2009 | True | ... | 22.00 | 93.00 | 17.00 | 10.00 | 6.00 | 83.00 | 11.00 | 221.00 | 94.00 | 127.00 |
399225358 | b_muzz | Murray, Brendan | Bloomberg News | Managing Editor, U.S. Economy | M | 624 | 382 | 360 | Thu Oct 27 05:34:05 +0000 2011 | True | ... | 2.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 45.00 | 13.00 | 32.00 |
18834692 | becca_milfeld | Milfeld, Becca | Agence France-Presse | English Desk Editor and Journalist | F | 483 | 993 | 1484 | Sat Jan 10 13:58:43 +0000 2009 | False | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 18.00 | 9.00 | 9.00 |
5 rows × 37 columns
# Spot-check the merged row for a single journalist, looked up by index label.
user_merge_df.loc['407013776']
screen_name burgessev name Everett, John B. organization Politico position Congressional Reporter gender M followers_count 31010 following_count 1782 tweet_count 27294 user_created_at Mon Nov 07 14:22:19 +0000 2011 verified True protected False original 836.00 quote 344.00 reply 275.00 retweet 218.00 tweets_in_dataset 1,673.00 mention_count 212.00 mentioning_count 46.00 mention_count_by_female 164.00 mentioning_count_by_female 20.00 mention_count_by_male 48.00 mentioning_count_by_male 26.00 retweet_count 1,836.00 retweeting_count 289.00 retweet_count_by_female 748.00 retweeting_count_by_female 122.00 retweet_count_by_male 1,088.00 retweeting_count_by_male 167.00 reply_to_count 238.00 replying_count 79.00 reply_to_count_by_female 78.00 replying_count_by_female 30.00 reply_to_count_by_male 160.00 replying_count_by_male 49.00 journalist_follower_count 570.00 journalist_follower_count_by_female 265.00 journalist_follower_count_by_male 305.00 Name: 407013776, dtype: object