%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Set the float format so that pandas doesn't display scientific notation
pd.options.display.float_format = '{:20,.2f}'.format
def tweet_transform(tweet):
    """Reduce a raw tweet dict to the flat record kept for each tweet.

    Returns a dict holding the tweet id, the parsed creation timestamp,
    the author's id and screen name, and the label returned by
    ``tweet_type`` for this tweet.
    """
    record = {'tweet_id': tweet['id_str']}
    record['tweet_created_at'] = date_parse(tweet['created_at'])
    author = tweet['user']
    record['user_id'] = author['id_str']
    record['screen_name'] = author['screen_name']
    record['tweet_type'] = tweet_type(tweet)
    return record
tweet_df = load_tweet_df(tweet_transform, ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type'], dedupe_columns=['tweet_id'])
tweet_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 817136 user_id 817136 screen_name 817136 tweet_created_at 817136 tweet_type 817136 dtype: int64
tweet_df.head()
tweet_id | user_id | screen_name | tweet_created_at | tweet_type | |
---|---|---|---|---|---|
0 | 872631046088601600 | 327862439 | jonathanvswan | 2017-06-08 01:47:08+00:00 | retweet |
1 | 872610483647516673 | 327862439 | jonathanvswan | 2017-06-08 00:25:26+00:00 | retweet |
2 | 872609618626826240 | 327862439 | jonathanvswan | 2017-06-08 00:22:00+00:00 | retweet |
3 | 872605974699311104 | 327862439 | jonathanvswan | 2017-06-08 00:07:31+00:00 | retweet |
4 | 872603191518646276 | 327862439 | jonathanvswan | 2017-06-07 23:56:27+00:00 | retweet |
tweet_df.tweet_created_at.min()
Timestamp('2017-06-01 04:00:01+0000', tz='UTC')
tweet_df.tweet_created_at.max()
Timestamp('2017-08-01 03:59:58+0000', tz='UTC')
pd.DataFrame({'count':tweet_df.tweet_type.value_counts(),
'percentage':tweet_df.tweet_type.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})
count | percentage | |
---|---|---|
retweet | 345266 | 42.3% |
original | 233926 | 28.6% |
reply | 126254 | 15.5% |
quote | 111690 | 13.7% |
This comes from the following sources:
Thus, the tweeter data should include tweet and user info data only from users in the user lookup.
user_lookup_filepaths = (
    'lookups/senate_press_lookup.csv',
    'lookups/periodical_press_lookup.csv',
    'lookups/radio_and_television_lookup.csv',
)
# Stack the lookup files, reading only the user id (kept as a string) and
# screen name columns from each.
user_lookup_df = pd.concat([
    pd.read_csv(user_lookup_filepath, usecols=['Uid', 'Token'], dtype={'Uid': str})
    for user_lookup_filepath in user_lookup_filepaths
])
# Index by user id and switch to the column/index names used in the rest of
# the notebook.
user_lookup_df = (user_lookup_df
                  .set_index('Uid')
                  .rename(columns={'Token': 'screen_name'})
                  .rename_axis('user_id'))
# Some users may be in multiple lists, so need to drop duplicates
user_lookup_df = user_lookup_df.loc[~user_lookup_df.index.duplicated()]
user_lookup_df.count()
screen_name 2487 dtype: int64
user_lookup_df.head()
screen_name | |
---|---|
user_id | |
23455653 | abettel |
33919343 | AshleyRParker |
18580432 | b_fung |
399225358 | b_muzz |
18834692 | becca_milfeld |
# Count tweets per (user, tweet type), pivot the tweet types into columns and
# treat a missing user/type combination as zero tweets.
user_tweet_count_df = (tweet_df[['user_id', 'tweet_type']]
                       .groupby(['user_id', 'tweet_type'])
                       .size()
                       .unstack()
                       .fillna(0))
# Total tweets per user across the four tweet types.
user_tweet_count_df['tweets_in_dataset'] = (
    user_tweet_count_df[['original', 'quote', 'reply', 'retweet']].sum(axis=1)
)
user_tweet_count_df.count()
tweet_type original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 dtype: int64
user_tweet_count_df.head()
tweet_type | original | quote | reply | retweet | tweets_in_dataset |
---|---|---|---|---|---|
user_id | |||||
1001991865 | 13.00 | 3.00 | 1.00 | 31.00 | 48.00 |
1002229862 | 48.00 | 20.00 | 3.00 | 118.00 | 189.00 |
100270054 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 |
100802089 | 4.00 | 7.00 | 12.00 | 17.00 | 40.00 |
100860790 | 102.00 | 26.00 | 4.00 | 166.00 | 298.00 |
user_info_df = pd.read_csv('source_data/user_info_lookup.csv', names=['user_id', 'name', 'organization', 'position',
'gender', 'followers_count', 'following_count', 'tweet_count',
'user_created_at', 'verified', 'protected'],
dtype={'user_id': str}).set_index(['user_id'])
user_info_df.count()
name 2506 organization 2477 position 2503 gender 2505 followers_count 2506 following_count 2506 tweet_count 2506 user_created_at 2506 verified 2506 protected 2506 dtype: int64
user_info_df.head()
name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
20711445 | Glinski, Nina | NaN | Freelance Reporter | F | 963 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False |
258917371 | Enders, David | NaN | Journalist | M | 1444 | 484 | 6296 | Mon Feb 28 19:52:03 +0000 2011 | True | False |
297046834 | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 759 | 352 | 631 | Wed May 11 20:55:24 +0000 2011 | True | False |
455585786 | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2944 | 2691 | 6277 | Thu Jan 05 08:26:46 +0000 2012 | True | False |
42584840 | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2703 | 201 | 6366 | Tue May 26 07:41:38 +0000 2009 | False | False |
# One row per user in the lookup, enriched with profile info and per-type
# tweet counts. A left join keeps users that have no tweets in the dataset.
user_summary_df = user_lookup_df.join((user_info_df, user_tweet_count_df), how='left')
# Fill NaNs: a missing organization becomes an empty string, and users without
# tweets in the dataset get zero for every count.
# A single DataFrame-level fillna is used instead of column-level
# ``df[col].fillna(..., inplace=True)``: the latter mutates a temporary under
# pandas Copy-on-Write and would leave user_summary_df unchanged.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()
screen_name 2487 name 2487 organization 2487 position 2484 gender 2486 followers_count 2487 following_count 2487 tweet_count 2487 user_created_at 2487 verified 2487 protected 2487 original 2487 quote 2487 reply 2487 retweet 2487 tweets_in_dataset 2487 dtype: int64
user_summary_df.head()
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||||||||
23455653 | abettel | Bettelheim, Adriel | Politico | Health Care Editor | F | 2664 | 1055 | 15990 | Mon Mar 09 16:32:20 +0000 2009 | True | False | 289.00 | 12.00 | 6.00 | 52.00 | 359.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | White House Reporter | F | 122382 | 2342 | 12433 | Tue Apr 21 14:28:57 +0000 2009 | True | False | 172.00 | 67.00 | 11.00 | 120.00 | 370.00 |
18580432 | b_fung | Fung, Brian | Washington Post | Tech Reporter | M | 16558 | 2062 | 44799 | Sat Jan 03 15:15:57 +0000 2009 | True | False | 257.00 | 85.00 | 205.00 | 82.00 | 629.00 |
399225358 | b_muzz | Murray, Brendan | Bloomberg News | Managing Editor, U.S. Economy | M | 624 | 382 | 360 | Thu Oct 27 05:34:05 +0000 2011 | True | False | 3.00 | 0.00 | 0.00 | 5.00 | 8.00 |
18834692 | becca_milfeld | Milfeld, Becca | Agence France-Presse | English Desk Editor and Journalist | F | 483 | 993 | 1484 | Sat Jan 10 13:58:43 +0000 2009 | False | False | 3.00 | 14.00 | 0.00 | 7.00 | 24.00 |
user_summary_df[user_summary_df.tweets_in_dataset == 0].count()
screen_name 195 name 195 organization 195 position 195 gender 194 followers_count 195 following_count 195 tweet_count 195 user_created_at 195 verified 195 protected 195 original 195 quote 195 reply 195 retweet 195 tweets_in_dataset 195 dtype: int64
user_summary_df = user_summary_df[user_summary_df.tweets_in_dataset != 0]
user_summary_df.count()
screen_name 2292 name 2292 organization 2292 position 2289 gender 2292 followers_count 2292 following_count 2292 tweet_count 2292 user_created_at 2292 verified 2292 protected 2292 original 2292 quote 2292 reply 2292 retweet 2292 tweets_in_dataset 2292 dtype: int64
journalist_gender_summary_df = pd.DataFrame({'count':user_summary_df.gender.value_counts(), 'percentage':user_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})
journalist_gender_summary_df
count | percentage | |
---|---|---|
M | 1299 | 56.7% |
F | 993 | 43.3% |
user_summary_df[['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()
followers_count | following_count | tweet_count | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
count | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 | 2,292.00 |
mean | 16,467.62 | 1,444.83 | 9,619.69 | 102.06 | 48.73 | 55.08 | 150.64 | 356.52 |
std | 91,886.90 | 3,003.00 | 16,618.09 | 169.43 | 135.90 | 249.18 | 585.08 | 833.76 |
min | 6.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
25% | 831.75 | 505.75 | 1,449.50 | 10.00 | 1.00 | 1.00 | 8.00 | 32.00 |
50% | 2,419.50 | 998.50 | 4,211.50 | 41.00 | 9.00 | 5.00 | 39.00 | 122.00 |
75% | 7,348.75 | 1,713.50 | 10,817.25 | 124.25 | 43.00 | 30.00 | 129.00 | 375.00 |
max | 2,176,578.00 | 96,194.00 | 208,763.00 | 2,693.00 | 3,069.00 | 9,033.00 | 21,524.00 | 21,547.00 |
user_summary_df[user_summary_df.gender == 'F'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()
followers_count | following_count | tweet_count | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
count | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 | 993.00 |
mean | 11,609.53 | 1,314.07 | 7,498.74 | 83.84 | 39.27 | 32.06 | 135.55 | 290.72 |
std | 65,563.72 | 1,250.56 | 11,312.72 | 124.86 | 135.05 | 94.73 | 724.92 | 833.07 |
min | 6.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
25% | 825.00 | 567.00 | 1,393.00 | 8.00 | 1.00 | 1.00 | 9.00 | 32.00 |
50% | 2,327.00 | 1,034.00 | 4,055.00 | 39.00 | 9.00 | 4.00 | 37.00 | 111.00 |
75% | 6,340.00 | 1,659.00 | 8,983.00 | 111.00 | 33.00 | 21.00 | 115.00 | 314.00 |
max | 1,388,543.00 | 18,197.00 | 118,713.00 | 1,440.00 | 3,069.00 | 1,458.00 | 21,524.00 | 21,547.00 |
user_summary_df[user_summary_df.gender == 'M'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()
followers_count | following_count | tweet_count | original | quote | reply | retweet | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
count | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 | 1,299.00 |
mean | 20,181.31 | 1,544.78 | 11,241.02 | 115.99 | 55.96 | 72.69 | 162.17 | 406.81 |
std | 107,635.37 | 3,833.89 | 19,584.46 | 195.72 | 136.16 | 319.41 | 449.75 | 831.10 |
min | 10.00 | 0.00 | 5.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
25% | 857.50 | 472.00 | 1,477.00 | 12.00 | 0.00 | 1.00 | 6.00 | 33.00 |
50% | 2,498.00 | 953.00 | 4,401.00 | 44.00 | 9.00 | 6.00 | 40.00 | 131.00 |
75% | 8,341.50 | 1,763.00 | 12,584.50 | 140.00 | 50.50 | 38.50 | 142.00 | 428.00 |
max | 2,176,578.00 | 96,194.00 | 208,763.00 | 2,693.00 | 1,955.00 | 9,033.00 | 7,528.00 | 11,432.00 |
pd.DataFrame({'count':user_summary_df.verified.value_counts(), 'percentage':user_summary_df.verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})
count | percentage | |
---|---|---|
True | 1240 | 54.1% |
False | 1052 | 45.9% |
pd.DataFrame({'count':user_summary_df[user_summary_df.gender == 'F'].verified.value_counts(), 'percentage':user_summary_df[user_summary_df.gender == 'F'].verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})
count | percentage | |
---|---|---|
True | 512 | 51.6% |
False | 481 | 48.4% |
pd.DataFrame({'count':user_summary_df[user_summary_df.gender == 'M'].verified.value_counts(), 'percentage':user_summary_df[user_summary_df.gender == 'M'].verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})
count | percentage | |
---|---|---|
True | 728 | 56.0% |
False | 571 | 44.0% |
Including original tweets only
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Set float format so doesn't display scientific notation
pd.options.display.float_format = '{:20,.2f}'.format
# Simplify the tweet on load
def mention_transform(tweet):
    """Return one record per user mention found in an original tweet.

    Tweets whose ``tweet_type`` is not ``'original'`` yield an empty list.
    Each record holds the tweet id, the author's id and screen name, the
    mentioned user's id and screen name, and the parsed creation timestamp.
    """
    if tweet_type(tweet) != 'original':
        return []
    # A missing 'entities' or 'user_mentions' entry is treated as "no mentions".
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': mention['id_str'],
            'mention_screen_name': mention['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mention in tweet.get('entities', {}).get('user_mentions', [])
    ]
base_mention_df = load_tweet_df(mention_transform, ['tweet_id', 'user_id', 'screen_name', 'mention_user_id',
'mention_screen_name', 'tweet_created_at'],
dedupe_columns=['tweet_id', 'mention_user_id'])
base_mention_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 118210 user_id 118210 screen_name 118210 mention_user_id 118210 mention_screen_name 118210 tweet_created_at 118210 dtype: int64
base_mention_df.head()
tweet_id | user_id | screen_name | mention_user_id | mention_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 872522339962978307 | 327862439 | jonathanvswan | 800707492346925056 | axios | 2017-06-07 18:35:11+00:00 |
1 | 872484939530461184 | 327862439 | jonathanvswan | 17494010 | SenSchumer | 2017-06-07 16:06:34+00:00 |
2 | 872475140575170562 | 327862439 | jonathanvswan | 2836421 | MSNBC | 2017-06-07 15:27:37+00:00 |
3 | 872475140575170562 | 327862439 | jonathanvswan | 800707492346925056 | axios | 2017-06-07 15:27:37+00:00 |
4 | 872459457946673154 | 327862439 | jonathanvswan | 800707492346925056 | axios | 2017-06-07 14:25:18+00:00 |
mention_df = base_mention_df.join(user_summary_df['gender'], on='user_id')
mention_df.count()
tweet_id 118210 user_id 118210 screen_name 118210 mention_user_id 118210 mention_screen_name 118210 tweet_created_at 118210 gender 118210 dtype: int64
mention_df['tweet_id'].unique().size
84942
mention_df['mention_user_id'].unique().size
17730
journalists_mention_df = mention_df.join(user_summary_df['gender'], how='inner', on='mention_user_id', rsuffix='_mention')
journalists_mention_df.rename(columns = {'gender_mention': 'mention_gender'}, inplace=True)
journalists_mention_df.count()
tweet_id 14298 user_id 14298 screen_name 14298 mention_user_id 14298 mention_screen_name 14298 tweet_created_at 14298 gender 14298 mention_gender 14298 dtype: int64
journalists_mention_df.head()
tweet_id | user_id | screen_name | mention_user_id | mention_screen_name | tweet_created_at | gender | mention_gender | |
---|---|---|---|---|---|---|---|---|
16 | 870408075878027268 | 327862439 | jonathanvswan | 16031927 | greta | 2017-06-01 22:33:51+00:00 | M | F |
283 | 872581449861541893 | 19847765 | sahilkapur | 16031927 | greta | 2017-06-07 22:30:04+00:00 | M | F |
2202 | 872578055910371328 | 21252618 | JakeSherman | 16031927 | greta | 2017-06-07 22:16:34+00:00 | M | F |
15977 | 880841069243629568 | 70511174 | Hadas_Gold | 16031927 | greta | 2017-06-30 17:30:50+00:00 | F | F |
17258 | 880183952018886661 | 90077282 | politicoalex | 16031927 | greta | 2017-06-28 21:59:41+00:00 | M | F |
# Gender of beltway journalists mentioned by beltway journalists
def journalist_mention_gender_summary(mention_df):
    """Summarize mentions by the gender of the mentioned journalist.

    Args:
        mention_df: DataFrame with a ``mention_gender`` column, one row per
            mention.

    Returns:
        DataFrame indexed by gender (index named ``'index'``) with columns
        ``count`` (number of mentions), ``percentage`` (share of all mentions,
        formatted like ``'58.0%'``) and ``avg_mentions`` (mentions divided by
        the number of journalists of that gender, taken from the module-level
        ``journalist_gender_summary_df['count']``).

    Raises:
        KeyError: if a mentioned gender has no row in
            ``journalist_gender_summary_df``.
    """
    mention_counts = mention_df.mention_gender.value_counts()
    mention_shares = mention_df.mention_gender.value_counts(normalize=True)
    gender_summary_df = pd.DataFrame({
        'count': mention_counts,
        'percentage': mention_shares.mul(100).round(1).astype(str) + '%',
    })
    # Look the journalist totals up by gender label. This avoids going through
    # reset_index(), whose generated column name depends on the pandas version
    # ('index' for an unnamed index, the original column name on pandas >= 2.0).
    journalists_per_gender = journalist_gender_summary_df['count'].loc[gender_summary_df.index]
    gender_summary_df['avg_mentions'] = gender_summary_df['count'] / journalists_per_gender.to_numpy()
    # Keep the index name this function has always produced.
    gender_summary_df.index.name = 'index'
    return gender_summary_df
def journalist_mention_summary(mention_df):
    """Attach mention statistics to every user in ``user_summary_df``.

    Args:
        mention_df: DataFrame with ``mention_user_id`` and ``user_id``
            columns, one row per mention.

    Returns:
        A copy of the module-level ``user_summary_df`` with two extra columns:
        ``mention_count`` (times the user was mentioned) and
        ``mentioning_count`` (number of distinct users who mentioned them).
        Users that were never mentioned get 0 in both. Rows are sorted by
        ``mention_count``, ``mentioning_count`` and ``followers_count``,
        all descending.
    """
    # Mention count
    mention_count_df = mention_df.mention_user_id.value_counts().rename('mention_count').to_frame()
    # Mentioning users. That is, the number of unique users mentioning each user.
    mention_user_id_per_user_df = mention_df[['mention_user_id', 'user_id']].drop_duplicates()
    mentioning_user_count_df = (mention_user_id_per_user_df
                                .groupby('mention_user_id')
                                .size()
                                .to_frame('mentioning_count'))
    mentioning_user_count_df.index.name = 'user_id'
    # Join with user summary
    journalist_mention_summary_df = user_summary_df.join([mention_count_df, mentioning_user_count_df])
    # Zero-fill only the two counts: a frame-wide fillna(0) would also replace
    # missing text fields (e.g. position) with the number 0.
    journalist_mention_summary_df = journalist_mention_summary_df.fillna(
        {'mention_count': 0, 'mentioning_count': 0})
    return journalist_mention_summary_df.sort_values(
        ['mention_count', 'mentioning_count', 'followers_count'], ascending=False)
# Gender of top journalists mentioned by beltway journalists
def top_journalist_mention_gender_summary(mention_summary_df, mentioning_count_threshold=0, head=100):
    """Gender breakdown of the first ``head`` sufficiently-mentioned journalists.

    Rows of ``mention_summary_df`` whose ``mentioning_count`` is strictly
    greater than ``mentioning_count_threshold`` are kept in their existing
    order, truncated to the first ``head`` rows, and then counted by
    ``gender``. The result has a ``count`` column and a ``percentage`` column
    formatted like ``'42.0%'``.
    """
    above_threshold = mention_summary_df['mentioning_count'] > mentioning_count_threshold
    top_rows = mention_summary_df.loc[above_threshold].head(head)
    gender_counts = top_rows['gender'].value_counts()
    gender_shares = top_rows['gender'].value_counts(normalize=True)
    percentage = gender_shares.mul(100).round(1).astype(str) + '%'
    return pd.DataFrame({'count': gender_counts, 'percentage': percentage})
# Fields for displaying journalist mention summaries
journalist_mention_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'mention_count', 'mentioning_count']
Note that for each of these, the complete list is being written to CSV in the output directory.
# Total original tweets per gender.
original_tweets_by_gender_df = user_summary_df[['gender', 'original']].groupby('gender').sum()
# Share of all original tweets contributed by each gender, formatted like '35.6%'.
original_tweets_by_gender_df['percentage'] = (
    (original_tweets_by_gender_df['original'] / user_summary_df['original'].sum() * 100)
    .round(1)
    .astype(str) + '%'
)
# Average original tweets per journalist: divide each gender's total by the
# number of journalists of that gender, looked up by gender label.
original_tweets_by_gender_df['avg_original'] = (
    original_tweets_by_gender_df['original']
    / journalist_gender_summary_df['count'].loc[original_tweets_by_gender_df.index].to_numpy()
)
original_tweets_by_gender_df
original | percentage | avg_original | |
---|---|---|---|
gender | |||
F | 83,251.00 | 35.6% | 83.84 |
M | 150,675.00 | 64.4% | 115.99 |
user_summary_df[['screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'original', 'tweets_in_dataset']].sort_values(['original'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | tweet_count | original | tweets_in_dataset | |
---|---|---|---|---|---|---|---|---|
user_id | ||||||||
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 61461 | 2,693.00 | 2,693.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 115132 | 1,858.00 | 2,089.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 92203 | 1,534.00 | 5,187.00 |
19580890 | LeeCamp | Camp, Lee | RTTV America | M | 67601 | 52051 | 1,517.00 | 3,708.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 100803 | 1,440.00 | 8,196.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 49967 | 1,332.00 | 4,414.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 148143 | 1,316.00 | 5,078.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 161148 | 1,271.00 | 2,106.00 |
36246939 | malbertnews | Albert, Mark | The Voyage Report | M | 3575 | 28230 | 1,078.00 | 1,151.00 |
117467779 | palbergo | Albergo, Paul F. | Bloomberg BNA | M | 1191 | 18083 | 1,043.00 | 1,236.00 |
102171691 | rlocker12 | Locker, Ray | USA Today | M | 3665 | 41194 | 1,038.00 | 2,496.00 |
15486163 | SimonMarksFSN | Marks, Simon | Feature Story News | M | 7767 | 41541 | 984.00 | 3,432.00 |
275207082 | AlexParkerDC | Parker, Alexander M. | Bloomberg BNA | M | 3828 | 142150 | 972.00 | 3,983.00 |
190360266 | connorobrienNH | O’Brien, Connor | Politico | M | 6158 | 17242 | 954.00 | 1,944.00 |
16031927 | greta | Van Susteren, Greta | MSNBC | F | 1186850 | 116645 | 907.00 | 4,792.00 |
300497193 | tackettdc | Tackett, R. Michael | New York Times | M | 16857 | 38620 | 896.00 | 1,041.00 |
191964162 | SamLitzinger | Litzinger, Sam | CBS News | M | 2329 | 95236 | 891.00 | 7,537.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 42497 | 885.00 | 3,960.00 |
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 208763 | 871.00 | 11,432.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 78015 | 846.00 | 6,377.00 |
27882000 | jamiedupree | Dupree, Jamie | Cox Broadcasting | M | 140848 | 46181 | 841.00 | 2,108.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 27294 | 836.00 | 1,673.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 104613 | 824.00 | 4,907.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 48025 | 822.00 | 1,604.00 |
13262862 | HowardMortman | Mortman, Howard | C–SPAN | M | 6211 | 38406 | 819.00 | 1,289.00 |
This is based on screen name, which could have changed during the collection period. However, for the users at the top of this list, that seems unlikely.
# Number of mentions received per screen name
mention_count_screen_name_df = (mention_df['mention_screen_name']
                                .value_counts()
                                .rename('mention_count')
                                .to_frame())
# Number of distinct users mentioning each screen name: reduce to unique
# (mentioned screen name, mentioning user) pairs first, then count per name.
mention_user_id_per_user_screen_name_df = mention_df[['mention_screen_name', 'user_id']].drop_duplicates()
mentioning_count_screen_name_df = (mention_user_id_per_user_screen_name_df
                                   .groupby('mention_screen_name')
                                   .size()
                                   .to_frame('mentioning_count'))
mentioning_count_screen_name_df.index.name = 'screen_name'
# Combine both counts and write the complete list to the output directory.
all_mentioned_df = mention_count_screen_name_df.join(mentioning_count_screen_name_df)
all_mentioned_df.to_csv('output/all_mentioned_by_journalists.csv')
all_mentioned_df.head(25)
mention_count | mentioning_count | |
---|---|---|
realDonaldTrump | 2876 | 452 |
POTUS | 2265 | 253 |
wusa9 | 2111 | 41 |
AP | 1948 | 143 |
USATODAY | 1235 | 105 |
nbcwashington | 1230 | 70 |
WSJ | 1227 | 152 |
dcexaminer | 1034 | 53 |
SHSanders45 | 927 | 148 |
nytimes | 829 | 289 |
BloombergBNA | 759 | 45 |
politico | 747 | 181 |
SpeakerRyan | 700 | 181 |
Scaramucci | 657 | 198 |
PressSec | 654 | 178 |
CNN | 628 | 186 |
ABC7News | 604 | 24 |
SenJohnMcCain | 599 | 231 |
WTOP | 529 | 43 |
BloombergLaw | 517 | 15 |
VP | 506 | 140 |
SteveScalise | 505 | 150 |
MSNBC | 486 | 92 |
Reuters | 483 | 84 |
bpolitics | 432 | 69 |
all_mentioned_df.sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)
mention_count | mentioning_count | |
---|---|---|
realDonaldTrump | 2876 | 452 |
nytimes | 829 | 289 |
POTUS | 2265 | 253 |
SenJohnMcCain | 599 | 231 |
Scaramucci | 657 | 198 |
CNN | 628 | 186 |
politico | 747 | 181 |
SpeakerRyan | 700 | 181 |
PressSec | 654 | 178 |
washingtonpost | 413 | 154 |
WSJ | 1227 | 152 |
SteveScalise | 505 | 150 |
SHSanders45 | 927 | 148 |
AP | 1948 | 143 |
VP | 506 | 140 |
SenateMajLdr | 412 | 120 |
DonaldJTrumpJr | 199 | 110 |
RandPaul | 206 | 107 |
USATODAY | 1235 | 105 |
LindseyGrahamSC | 253 | 105 |
SenSchumer | 265 | 97 |
NancyPelosi | 266 | 95 |
MSNBC | 486 | 92 |
CNNPolitics | 329 | 91 |
MarkWarner | 204 | 89 |
journalists_mention_summary_df = journalist_mention_summary(journalists_mention_df)
journalists_mention_summary_df.to_csv('output/journalists_mentioned_by_journalists.csv')
journalists_mention_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
325050734 | AllysonRaeWx | Banks, Allyson | WUSA–TV | F | 6918 | 330.00 | 7.00 |
28496589 | TenaciousTopper | Shutt, Charles | WUSA–TV | M | 15868 | 239.00 | 13.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 235.00 | 10.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 212.00 | 46.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 200.00 | 31.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 143.00 | 41.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 127.00 | 51.00 |
169586280 | WaPoSean | Sullivan, Sean | Washington Post | M | 22860 | 117.00 | 20.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 116.00 | 47.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 115.00 | 55.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 109.00 | 22.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 107.00 | 43.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 106.00 | 42.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 105.00 | 27.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 104.00 | 40.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 100.00 | 31.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 98.00 | 16.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 95.00 | 43.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 87.00 | 14.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 84.00 | 41.00 |
26632935 | HopeSeck | Hodge Seck, Hope | Military.com | F | 4584 | 83.00 | 3.00 |
48802204 | HardballChris | Matthews, Chris | NBC News | M | 718330 | 80.00 | 9.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 78.00 | 37.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 78.00 | 27.00 |
24439201 | jameshohmann | Hohmann, James P. | Washington Post | M | 38708 | 78.00 | 27.00 |
journalists_mention_summary_df[journalist_mention_summary_fields].sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 115.00 | 55.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 127.00 | 51.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 116.00 | 47.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 212.00 | 46.00 |
112526560 | kenvogel | Vogel, Kenneth P. | Politico | M | 53894 | 67.00 | 45.00 |
18227519 | morningmika | Brzezinski, Mika | MSNBC | F | 653031 | 70.00 | 44.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 107.00 | 43.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 95.00 | 43.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 106.00 | 42.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 143.00 | 41.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 84.00 | 41.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 104.00 | 40.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 61.00 | 38.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 78.00 | 37.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 75.00 | 37.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 43.00 | 32.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 200.00 | 31.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 100.00 | 31.00 |
50325797 | chucktodd | Todd, Chuck | NBC News | M | 1781247 | 40.00 | 31.00 |
71294756 | wolfblitzer | Blitzer, Wolf | CNN | M | 1281914 | 56.00 | 30.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 52.00 | 30.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 67.00 | 29.00 |
16031927 | greta | Van Susteren, Greta | MSNBC | F | 1186850 | 37.00 | 28.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 105.00 | 27.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 78.00 | 27.00 |
journalist_mention_gender_summary(journalists_mention_df)
count | percentage | avg_mentions | |
---|---|---|---|
index | |||
M | 8298 | 58.0% | 6.39 |
F | 6000 | 42.0% | 6.04 |
journalists_mention_summary_df[['mention_count']].describe()
mention_count | |
---|---|
count | 2,292.00 |
mean | 6.24 |
std | 17.59 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 5.00 |
max | 330.00 |
female_journalists_mention_summary_df = journalists_mention_summary_df[journalists_mention_summary_df.gender == 'F']
female_journalists_mention_summary_df.to_csv('output/female_journalists_mentioned_by_journalists.csv')
female_journalists_mention_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
325050734 | AllysonRaeWx | Banks, Allyson | WUSA–TV | F | 6918 | 330.00 | 7.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 200.00 | 31.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 143.00 | 41.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 115.00 | 55.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 109.00 | 22.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 100.00 | 31.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 87.00 | 14.00 |
26632935 | HopeSeck | Hodge Seck, Hope | Military.com | F | 4584 | 83.00 | 3.00 |
16441088 | jestei | Steinhauer, Jennifer | New York Times | F | 13452 | 76.00 | 26.00 |
18227519 | morningmika | Brzezinski, Mika | MSNBC | F | 653031 | 70.00 | 44.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 67.00 | 29.00 |
139738464 | mj_lee | Lee, MJ | CNN | F | 31940 | 67.00 | 27.00 |
204599219 | pw_cunningham | Cunningham, Paige | Washington Examiner | F | 9255 | 67.00 | 18.00 |
118747545 | eilperin | Eilperin, Juliet | Washington Post | F | 20483 | 67.00 | 16.00 |
360080772 | FoxReports | Fox, Lauren | CNN | F | 7282 | 65.00 | 15.00 |
58869089 | margarettalev | Talev, Margaret | Bloomberg News | F | 19588 | 58.00 | 27.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 58.00 | 5.00 |
19734832 | sarahkliff | Kliff, Sarah L. | Vox Media | F | 100090 | 57.00 | 27.00 |
381664207 | caitlinnowens | Owens, Caitlin N. | Axios | F | 5749 | 57.00 | 9.00 |
167024520 | rachaelmbade | Bade, Rachel M. | Politico | F | 30164 | 56.00 | 26.00 |
247852986 | rachanadixit | Pradhan, Rachana D. | Politico | F | 6178 | 55.00 | 14.00 |
237477771 | juliehdavis | Davis, Julie | New York Times | F | 49821 | 55.00 | 10.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 55.00 | 4.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 52.00 | 30.00 |
48144950 | JudyWoodruff | Woodruff, Judy | PBS NewsHour | F | 64294 | 49.00 | 7.00 |
# Descriptive statistics (count, mean, std, quartiles, max) of the mention_count
# column of female_journalists_mention_summary_df.
female_journalists_mention_summary_df[['mention_count']].describe()
mention_count | |
---|---|
count | 993.00 |
mean | 6.04 |
std | 17.95 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 4.00 |
max | 330.00 |
# Keep only the rows of the journalist mention summary whose gender is 'M'.
male_journalists_mention_summary_df = journalists_mention_summary_df[journalists_mention_summary_df.gender == 'M']
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
male_journalists_mention_summary_df.to_csv('output/male_journalists_mentioned_by_journalists.csv')
male_journalists_mention_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
28496589 | TenaciousTopper | Shutt, Charles | WUSA–TV | M | 15868 | 239.00 | 13.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 235.00 | 10.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 212.00 | 46.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 127.00 | 51.00 |
169586280 | WaPoSean | Sullivan, Sean | Washington Post | M | 22860 | 117.00 | 20.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 116.00 | 47.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 107.00 | 43.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 106.00 | 42.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 105.00 | 27.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 104.00 | 40.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 98.00 | 16.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 95.00 | 43.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 84.00 | 41.00 |
48802204 | HardballChris | Matthews, Chris | NBC News | M | 718330 | 80.00 | 9.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 78.00 | 37.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 78.00 | 27.00 |
24439201 | jameshohmann | Hohmann, James P. | Washington Post | M | 38708 | 78.00 | 27.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 75.00 | 37.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 73.00 | 15.00 |
112526560 | kenvogel | Vogel, Kenneth P. | Politico | M | 53894 | 67.00 | 45.00 |
18646108 | BretBaier | Baier, Bret | Fox News | M | 1095184 | 66.00 | 18.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 61.00 | 38.00 |
16067683 | pauldemko | Demko, Paul Jeffrey | Politico | M | 8170 | 60.00 | 13.00 |
59676104 | danbalz | Balz, Daniel | Washington Post | M | 90819 | 57.00 | 26.00 |
71294756 | wolfblitzer | Blitzer, Wolf | CNN | M | 1281914 | 56.00 | 30.00 |
# Descriptive statistics (count, mean, std, quartiles, max) of the mention_count
# column for the rows whose gender is 'M'.
male_journalists_mention_summary_df[['mention_count']].describe()
mention_count | |
---|---|
count | 1,299.00 |
mean | 6.39 |
std | 17.31 |
min | 0.00 |
25% | 0.00 |
50% | 1.00 |
75% | 5.00 |
max | 239.00 |
# Rebuild the mention summary from only those mention rows whose gender column is 'F'.
# NOTE(review): presumably `gender` here is the gender of the mentioning journalist --
# confirm where journalists_mention_df is built.
journalists_mentioned_by_female_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'F'])
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
journalists_mentioned_by_female_summary_df.to_csv('output/journalists_mentioned_by_female_journalists.csv')
journalists_mentioned_by_female_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 164.00 | 20.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 116.00 | 13.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 79.00 | 10.00 |
169586280 | WaPoSean | Sullivan, Sean | Washington Post | M | 22860 | 71.00 | 11.00 |
48802204 | HardballChris | Matthews, Chris | NBC News | M | 718330 | 70.00 | 3.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 64.00 | 16.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 61.00 | 6.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 60.00 | 26.00 |
16067683 | pauldemko | Demko, Paul Jeffrey | Politico | M | 8170 | 57.00 | 10.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 53.00 | 2.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 52.00 | 8.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 49.00 | 11.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 47.00 | 10.00 |
247852986 | rachanadixit | Pradhan, Rachana D. | Politico | F | 6178 | 43.00 | 7.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 43.00 | 7.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 40.00 | 21.00 |
16930125 | edatpost | O’Keefe, Edward | Washington Post | M | 58670 | 40.00 | 18.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 37.00 | 13.00 |
16149614 | jrovner | Rovner, Julie | Kaiser Health News | F | 21844 | 35.00 | 14.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 35.00 | 13.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 35.00 | 12.00 |
158072303 | ValerieInsinna | Insinna, Valerie | Defense News | F | 4572 | 35.00 | 2.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 33.00 | 18.00 |
342226913 | GregStohr | Stohr, Greg | Bloomberg News | M | 7245 | 32.00 | 2.00 |
297532865 | kwelkernbc | Welker, Kristen | NBC News | F | 99234 | 31.00 | 9.00 |
# Gender summary for the mention rows whose gender column is 'F'.
# journalist_mention_gender_summary is defined outside this section.
journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'F'])
count | percentage | avg_mentions | |
---|---|---|---|
index | |||
M | 3162 | 54.8% | 2.43 |
F | 2605 | 45.2% | 2.62 |
# Rebuild the mention summary from only those mention rows whose gender column is 'M'.
# NOTE(review): presumably `gender` here is the gender of the mentioning journalist --
# confirm where journalists_mention_df is built.
journalists_mentioned_by_male_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
journalists_mentioned_by_male_summary_df.to_csv('output/journalists_mentioned_by_male_journalists.csv')
journalists_mentioned_by_male_summary_df[journalist_mention_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | mention_count | mentioning_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
325050734 | AllysonRaeWx | Banks, Allyson | WUSA–TV | F | 6918 | 324.00 | 4.00 |
28496589 | TenaciousTopper | Shutt, Charles | WUSA–TV | M | 15868 | 225.00 | 7.00 |
63149389 | hbwx | Bernstein, Howard | WUSA–TV | M | 8337 | 225.00 | 4.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 87.00 | 30.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 84.00 | 30.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 84.00 | 18.00 |
997684836 | pkcapitol | Kane, Paul | Washington Post | M | 31300 | 81.00 | 34.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 79.00 | 25.00 |
123327472 | peterbakernyt | Baker, Peter | New York Times | M | 96956 | 78.00 | 29.00 |
26632935 | HopeSeck | Hodge Seck, Hope | Military.com | F | 4584 | 76.00 | 1.00 |
15931637 | jonkarl | Karl, Jonathan | ABC News | M | 183467 | 71.00 | 22.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 69.00 | 31.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 67.00 | 27.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 66.00 | 29.00 |
16441088 | jestei | Steinhauer, Jennifer | New York Times | F | 13452 | 64.00 | 17.00 |
82151660 | kelsey_snell | Snell, Kelse | Washington Post | F | 8108 | 62.00 | 12.00 |
24439201 | jameshohmann | Hohmann, James P. | Washington Post | M | 38708 | 59.00 | 17.00 |
18646108 | BretBaier | Baier, Bret | Fox News | M | 1095184 | 59.00 | 14.00 |
108617810 | DanaBashCNN | Bash, Dana | CNN | F | 281861 | 55.00 | 29.00 |
9126752 | reporterjoe | Gould, Joseph M. | Sightline Media Group | M | 4702 | 55.00 | 9.00 |
381664207 | caitlinnowens | Owens, Caitlin N. | Axios | F | 5749 | 55.00 | 7.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 51.00 | 20.00 |
204599219 | pw_cunningham | Cunningham, Paige | Washington Examiner | F | 9255 | 51.00 | 9.00 |
112526560 | kenvogel | Vogel, Kenneth P. | Politico | M | 53894 | 50.00 | 32.00 |
36607254 | Oriana0214 | Pawlyk, Oriana | Military.com | F | 6397 | 50.00 | 3.00 |
# Gender summary for the mention rows whose gender column is 'M'.
# journalist_mention_gender_summary is defined outside this section.
journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])
count | percentage | avg_mentions | |
---|---|---|---|
index | |||
M | 5136 | 60.2% | 3.95 |
F | 3395 | 39.8% | 3.42 |
Including retweets and quotes
# Simplify the tweet on load
def retweet_transform(tweet):
    """Reduce a retweet or quote to the fields used by the retweet analysis.

    Args:
        tweet: a tweet as a dict (it is read with ``[...]`` and ``.get``).

    Returns:
        None when ``tweet_type(tweet)`` is neither ``'retweet'`` nor
        ``'quote'``. Otherwise a dict holding the tweet id, the posting
        user's id and screen name, the id and screen name of the user who
        wrote the retweeted (or, failing that, quoted) status, and the
        parsed creation time.
    """
    if tweet_type(tweet) not in ('retweet', 'quote'):
        return None
    # A retweeted status takes precedence; the quoted status is the fallback.
    # NOTE(review): assumes at least one of the two is present for these
    # tweet types -- confirm against utils.tweet_type.
    source_status = tweet.get('retweeted_status') or tweet.get('quoted_status')
    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'retweet_user_id': source_status['user']['id_str'],
        'retweet_screen_name': source_status['user']['screen_name'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }
# Load the tweets through retweet_transform, requesting the listed columns and
# passing tweet_id as the de-duplication column. load_tweet_df comes from utils;
# its exact behaviour is not shown in this notebook.
base_retweet_df = load_tweet_df(retweet_transform, ['tweet_id', 'user_id', 'screen_name', 'retweet_user_id',
'retweet_screen_name', 'tweet_created_at'],
dedupe_columns=['tweet_id'])
# Non-null row count per column.
base_retweet_df.count()
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000 DEBUG:root:Loaded 800000
tweet_id 456956 user_id 456956 screen_name 456956 retweet_user_id 456956 retweet_screen_name 456956 tweet_created_at 456956 dtype: int64
# Preview the first five rows of the loaded retweet/quote table.
base_retweet_df.head()
tweet_id | user_id | screen_name | retweet_user_id | retweet_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 872631046088601600 | 327862439 | jonathanvswan | 93069110 | maggieNYT | 2017-06-08 01:47:08+00:00 |
1 | 872610483647516673 | 327862439 | jonathanvswan | 160951141 | TomNamako | 2017-06-08 00:25:26+00:00 |
2 | 872609618626826240 | 327862439 | jonathanvswan | 18678924 | jmartNYT | 2017-06-08 00:22:00+00:00 |
3 | 872605974699311104 | 327862439 | jonathanvswan | 93069110 | maggieNYT | 2017-06-08 00:07:31+00:00 |
4 | 872603191518646276 | 327862439 | jonathanvswan | 94784682 | JonathanTurley | 2017-06-07 23:56:27+00:00 |
# Attach the retweeting user's gender by looking up user_id in user_summary_df
# (default left join, so every retweet row is kept).
retweet_df = base_retweet_df.join(user_summary_df['gender'], on='user_id')
# Non-null counts per column; a lower count for gender would mean some retweeting
# users are missing from user_summary_df.
retweet_df.count()
tweet_id 456956 user_id 456956 screen_name 456956 retweet_user_id 456956 retweet_screen_name 456956 tweet_created_at 456956 gender 456956 dtype: int64
# Number of distinct users who were retweeted or quoted.
retweet_df['retweet_user_id'].unique().size
49154
# Inner join on the retweeted user's id: only retweets/quotes of users present in
# user_summary_df remain. rsuffix disambiguates the second gender column, which
# would otherwise clash with the retweeting user's gender.
journalists_retweet_df = retweet_df.join(user_summary_df['gender'], how='inner', on='retweet_user_id', rsuffix='_retweet')
# Rename the suffixed column so it reads as the retweeted user's gender.
journalists_retweet_df.rename(columns = {'gender_retweet': 'retweet_gender'}, inplace=True)
# Non-null row count per column after the inner join.
journalists_retweet_df.count()
tweet_id 117048 user_id 117048 screen_name 117048 retweet_user_id 117048 retweet_screen_name 117048 tweet_created_at 117048 gender 117048 retweet_gender 117048 dtype: int64
# Preview the first five rows; gender is the retweeter's, retweet_gender the retweeted user's.
journalists_retweet_df.head()
tweet_id | user_id | screen_name | retweet_user_id | retweet_screen_name | tweet_created_at | gender | retweet_gender | |
---|---|---|---|---|---|---|---|---|
2 | 872609618626826240 | 327862439 | jonathanvswan | 18678924 | jmartNYT | 2017-06-08 00:22:00+00:00 | M | M |
435 | 871437820044464128 | 242169927 | colinwilhelm | 18678924 | jmartNYT | 2017-06-04 18:45:41+00:00 | M | M |
1406 | 872620054889857024 | 163589845 | PoliticoKevin | 18678924 | jmartNYT | 2017-06-08 01:03:28+00:00 | M | M |
1424 | 872240756597174272 | 163589845 | PoliticoKevin | 18678924 | jmartNYT | 2017-06-06 23:56:16+00:00 | M | M |
1455 | 870749993279385601 | 163589845 | PoliticoKevin | 18678924 | jmartNYT | 2017-06-02 21:12:30+00:00 | M | M |
# Gender of beltway journalists retweeted by beltway journalists
def journalist_retweet_gender_summary(retweet_df, journalist_counts=None):
    """Summarise retweets by the gender of the retweeted journalist.

    Args:
        retweet_df: DataFrame with a ``retweet_gender`` column, one row per
            retweet/quote.
        journalist_counts: optional Series mapping each gender label to the
            number of journalists of that gender. When omitted, the
            notebook-level ``journalist_gender_summary_df['count']`` is used,
            which is what the original implementation always read.

    Returns:
        DataFrame indexed by gender (index named ``index``) with columns
        ``count`` (number of retweets), ``percentage`` (share of all rows,
        as a string such as ``'68.9%'``) and ``avg_retweets`` (``count``
        divided by the number of journalists of that gender).

    Raises:
        KeyError: if a gender present in ``retweet_df`` has no entry in
            ``journalist_counts``.
    """
    if journalist_counts is None:
        journalist_counts = journalist_gender_summary_df['count']
    retweeted_gender = retweet_df.retweet_gender
    gender_summary_df = pd.DataFrame({
        'count': retweeted_gender.value_counts(),
        'percentage': retweeted_gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%',
    })
    # Look the per-gender totals up by index label instead of round-tripping
    # through reset_index()/row['index']: the name of the reset column depends
    # on the pandas version (value_counts names its index after the column
    # from pandas 2.0 on), which made the row-wise lookup raise KeyError.
    per_gender_totals = journalist_counts.loc[gender_summary_df.index].to_numpy()
    gender_summary_df['avg_retweets'] = gender_summary_df['count'] / per_gender_totals
    # Keep the index label the original produced via set_index('index').
    gender_summary_df.index.name = 'index'
    return gender_summary_df
def journalist_retweet_summary(retweet_df):
    """Build a per-journalist table of how often each one was retweeted.

    Args:
        retweet_df: DataFrame with ``retweet_user_id`` (the retweeted user)
            and ``user_id`` (the retweeting user) columns.

    Returns:
        The notebook-level ``user_summary_df`` joined with two extra columns,
        ``retweet_count`` (rows in ``retweet_df`` per retweeted user) and
        ``retweeting_count`` (distinct retweeting users per retweeted user).
        Missing values are filled with 0 and rows are sorted descending by
        retweet_count, then retweeting_count, then followers_count.
    """
    # How many times each user was retweeted.
    times_retweeted = retweet_df.retweet_user_id.value_counts().rename('retweet_count')
    retweet_count_df = pd.DataFrame(times_retweeted)

    # How many different users retweeted each user: collapse to one row per
    # (retweeted user, retweeting user) pair, then count pairs per retweeted user.
    distinct_pairs = retweet_df[['retweet_user_id', 'user_id']].drop_duplicates()
    retweeters_per_user = distinct_pairs.groupby('retweet_user_id').size()
    retweeting_user_count_df = pd.DataFrame(retweeters_per_user, columns=['retweeting_count'])
    retweeting_user_count_df.index.name = 'user_id'

    # Attach both counts to the user summary; users never retweeted get 0.
    summary_df = user_summary_df.join([retweet_count_df, retweeting_user_count_df])
    summary_df = summary_df.fillna(0)
    sort_keys = ['retweet_count', 'retweeting_count', 'followers_count']
    return summary_df.sort_values(sort_keys, ascending=False)
# Gender of top journalists retweeted by beltway journalists
def top_journalist_retweet_gender_summary(retweet_summary_df, retweeting_count_threshold=0, head=100):
    """Count genders among the first rows of a retweet summary.

    Args:
        retweet_summary_df: DataFrame with ``retweeting_count`` and ``gender``
            columns; rows are taken in the order given.
        retweeting_count_threshold: only rows whose retweeting_count is
            strictly greater than this value are considered.
        head: maximum number of qualifying rows to keep.

    Returns:
        DataFrame indexed by gender with columns ``count`` and ``percentage``
        (share of the kept rows, as a string such as ``'75.0%'``).
    """
    qualifies = retweet_summary_df.retweeting_count > retweeting_count_threshold
    top_genders = retweet_summary_df[qualifies].head(head).gender
    gender_counts = top_genders.value_counts()
    gender_shares = top_genders.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
    return pd.DataFrame({'count': gender_counts, 'percentage': gender_shares})
# Fields for displaying journalist retweet summaries
journalist_retweet_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'retweet_count', 'retweeting_count']
Note that for each of these, the complete list is being written to CSV in the output directory.
That is, by gender of retweeter.
# Sum the retweet and quote columns of user_summary_df per gender.
retweets_by_gender_df = user_summary_df.groupby('gender')[['retweet', 'quote']].sum()
retweets_by_gender_df['total'] = retweets_by_gender_df['retweet'] + retweets_by_gender_df['quote']
# Each gender's share of the combined total, formatted as a percentage string.
retweets_by_gender_df['percentage'] = (
    (retweets_by_gender_df['total'] / retweets_by_gender_df['total'].sum() * 100)
    .round(1)
    .astype(str)
    + '%'
)
# Average per journalist: a gender's total divided by the number of journalists
# of that gender recorded in journalist_gender_summary_df.
retweets_by_gender_df['avg_retweets'] = [
    gender_total / journalist_gender_summary_df.loc[gender]['count']
    for gender, gender_total in retweets_by_gender_df['total'].items()
]
retweets_by_gender_df
retweet | quote | total | percentage | avg_retweets | |
---|---|---|---|---|---|
gender | |||||
F | 134,606.00 | 38,998.00 | 173,604.00 | 38.0% | 174.83 |
M | 210,660.00 | 72,692.00 | 283,352.00 | 62.0% | 218.13 |
# Per-journalist view: profile columns plus the retweet and quote columns.
retweet_user_summary_df = user_summary_df.loc[:, [
    'screen_name', 'name', 'organization', 'gender', 'followers_count',
    'tweet_count', 'retweet', 'quote', 'tweets_in_dataset',
]]
# retweet_count combines the retweet and quote columns.
retweet_user_summary_df['retweet_count'] = retweet_user_summary_df['retweet'] + retweet_user_summary_df['quote']
# Show the 25 rows with the highest retweet_count.
retweet_user_summary_df.sort_values(by='retweet_count', ascending=False).head(25)
screen_name | name | organization | gender | followers_count | tweet_count | retweet | quote | tweets_in_dataset | retweet_count | |
---|---|---|---|---|---|---|---|---|---|---|
user_id | ||||||||||
2453025128 | gloriaminott | Minott, Gloria | WPFW–FM | F | 586 | 61473 | 21,524.00 | 0.00 | 21,547.00 | 21,524.00 |
304988603 | NeilWMcCabe | McCabe, Neil | Breitbart News | M | 18903 | 64673 | 7,528.00 | 625.00 | 9,370.00 | 8,153.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 100803 | 4,449.00 | 1,834.00 | 8,196.00 | 6,283.00 |
191964162 | SamLitzinger | Litzinger, Sam | CBS News | M | 2329 | 95236 | 6,017.00 | 225.00 | 7,537.00 | 6,242.00 |
21612122 | HotlineJosh | Kraushaar, Josh P. | National Journal | M | 50438 | 156610 | 4,881.00 | 893.00 | 6,703.00 | 5,774.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 78015 | 4,570.00 | 822.00 | 6,377.00 | 5,392.00 |
16031927 | greta | Van Susteren, Greta | MSNBC | F | 1186850 | 116645 | 794.00 | 3,069.00 | 4,792.00 | 3,863.00 |
21810329 | sdonnan | Donnan, Shawn | Financial Times | M | 12311 | 79125 | 3,332.00 | 449.00 | 4,537.00 | 3,781.00 |
47408060 | JonathanLanday | Landay, Jonathan | McClatchy Newspapers | M | 11213 | 81042 | 3,687.00 | 80.00 | 4,285.00 | 3,767.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 169908 | 2,703.00 | 859.00 | 4,564.00 | 3,562.00 |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | M | 74435 | 99050 | 2,694.00 | 684.00 | 4,560.00 | 3,378.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 104613 | 1,377.00 | 1,955.00 | 4,907.00 | 3,332.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 49967 | 1,740.00 | 1,327.00 | 4,414.00 | 3,067.00 |
456994513 | maria_e_recio | Recio, Maria | Austin American-Statesman | F | 1072 | 40822 | 2,613.00 | 336.00 | 3,370.00 | 2,949.00 |
19576571 | JaredRizzi | Rizzi, Jared | Sirius XM Satellite Radio | M | 13545 | 41620 | 2,112.00 | 828.00 | 5,567.00 | 2,940.00 |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | M | 20947 | 92203 | 2,231.00 | 521.00 | 5,187.00 | 2,752.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 148143 | 2,435.00 | 287.00 | 5,078.00 | 2,722.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 27573 | 2,505.00 | 184.00 | 2,871.00 | 2,689.00 |
19545932 | kampeas | Kampeas, Ron | Jewish Telegraphic Agency | M | 6977 | 53053 | 1,988.00 | 444.00 | 3,249.00 | 2,432.00 |
42352386 | rschles | Schlesinger, Robert | U.S. News & World Report | M | 4553 | 35375 | 1,644.00 | 617.00 | 2,459.00 | 2,261.00 |
25702314 | EricMGarcia | Garcia, Eric M. | CQ Roll Call | M | 3094 | 44783 | 528.00 | 1,723.00 | 3,584.00 | 2,251.00 |
18646108 | BretBaier | Baier, Bret | Fox News | M | 1095184 | 52271 | 1,623.00 | 615.00 | 2,379.00 | 2,238.00 |
15486163 | SimonMarksFSN | Marks, Simon | Feature Story News | M | 7767 | 41541 | 1,296.00 | 934.00 | 3,432.00 | 2,230.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 106970 | 1,665.00 | 467.00 | 2,810.00 | 2,132.00 |
15730608 | edroso | Edroso, Roy | UCG | M | 4696 | 38064 | 1,714.00 | 379.00 | 2,883.00 | 2,093.00 |
This is based on screen name, which could have changed during the collection period. However, for the users who would be at the top of this list, that seems unlikely.
# Retweet count: number of rows in retweet_df per retweeted screen name.
# retweet_df is not restricted to journalists, so this covers every retweeted account.
retweet_count_screen_name_df = pd.DataFrame(retweet_df.retweet_screen_name.value_counts().rename('retweet_count'))
# Count of retweeting users: collapse to one row per (retweeted screen name,
# retweeting user) pair, then count the pairs per retweeted screen name.
retweet_user_id_per_user_screen_name_df = retweet_df[['retweet_screen_name', 'user_id']].drop_duplicates()
retweeting_count_screen_name_df = pd.DataFrame(retweet_user_id_per_user_screen_name_df.groupby('retweet_screen_name').size(), columns=['retweeting_count'])
retweeting_count_screen_name_df.index.name = 'screen_name'
# Combine both counts on the screen-name index.
all_retweeted_df = retweet_count_screen_name_df.join(retweeting_count_screen_name_df)
# The CSV receives every row; the notebook displays only the first 25.
all_retweeted_df.to_csv('output/all_retweeted_by_journalists.csv')
all_retweeted_df.head(25)
retweet_count | retweeting_count | |
---|---|---|
realDonaldTrump | 6650 | 807 |
thehill | 5424 | 457 |
BraddJaffy | 3564 | 554 |
maggieNYT | 3024 | 530 |
business | 3000 | 229 |
washingtonpost | 2638 | 498 |
AP | 2480 | 581 |
politico | 2335 | 334 |
nytimes | 2268 | 485 |
WSJ | 1949 | 213 |
burgessev | 1836 | 289 |
kylegriffin1 | 1803 | 429 |
ZekeJMiller | 1723 | 387 |
CNN | 1602 | 366 |
GlennThrush | 1577 | 451 |
Reuters | 1487 | 265 |
jaketapper | 1459 | 397 |
TheEconomist | 1458 | 86 |
StevenTDennis | 1403 | 280 |
FoxNews | 1400 | 258 |
seungminkim | 1393 | 327 |
mkraju | 1359 | 341 |
PhilipRucker | 1349 | 365 |
markknoller | 1343 | 341 |
MEPFuller | 1324 | 286 |
# Summarise retweets among journalists: journalists_retweet_df holds only
# retweets/quotes of users that appear in user_summary_df.
journalists_retweet_summary_df = journalist_retweet_summary(journalists_retweet_df)
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
journalists_retweet_summary_df.to_csv('output/journalists_retweeted_by_journalists.csv')
journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 1,836.00 | 289.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 1,723.00 | 387.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,577.00 | 451.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,459.00 | 397.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 1,403.00 | 280.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 1,393.00 | 327.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 1,359.00 | 341.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,343.00 | 341.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 1,324.00 | 286.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,221.00 | 306.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 1,207.00 | 334.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 1,186.00 | 296.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 1,177.00 | 297.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 1,120.00 | 314.00 |
16006592 | BenjySarlin | Sarlin, Benjamin | NBC News | M | 78075 | 1,039.00 | 215.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 1,011.00 | 277.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 943.00 | 281.00 |
33653195 | ericawerner | Werner, Erica | Associated Press | F | 14049 | 939.00 | 281.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 916.00 | 247.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 909.00 | 388.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 849.00 | 306.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 829.00 | 315.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 770.00 | 193.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 708.00 | 13.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 705.00 | 155.00 |
# Retweets among journalists broken down by the gender of the retweeted journalist.
journalist_retweet_gender_summary(journalists_retweet_df)
count | percentage | avg_retweets | |
---|---|---|---|
index | |||
M | 80634 | 68.9% | 62.07 |
F | 36414 | 31.1% | 36.67 |
# Descriptive statistics of retweet_count across all rows of the journalist summary.
journalists_retweet_summary_df[['retweet_count']].describe()
retweet_count | |
---|---|
count | 2,292.00 |
mean | 51.07 |
std | 149.06 |
min | 0.00 |
25% | 0.00 |
50% | 6.00 |
75% | 33.00 |
max | 1,836.00 |
# Keep only the rows of the journalist retweet summary whose gender is 'F'.
female_journalists_retweet_summary_df = journalists_retweet_summary_df[journalists_retweet_summary_df.gender == 'F']
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
female_journalists_retweet_summary_df.to_csv('output/female_journalists_retweeted_by_journalists.csv')
female_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 1,393.00 | 327.00 |
33653195 | ericawerner | Werner, Erica | Associated Press | F | 14049 | 939.00 | 281.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 909.00 | 388.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 849.00 | 306.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 708.00 | 13.00 |
167024520 | rachaelmbade | Bade, Rachel M. | Politico | F | 30164 | 614.00 | 161.00 |
33919343 | AshleyRParker | Parker, Ashley | Washington Post | F | 122382 | 539.00 | 268.00 |
139738464 | mj_lee | Lee, MJ | CNN | F | 31940 | 518.00 | 189.00 |
16018516 | jenhab | Haberkorn, Jennifer A. | Politico | F | 20028 | 474.00 | 136.00 |
18825339 | CahnEmily | Cahn, Emily | Mic | F | 16980 | 444.00 | 118.00 |
45399148 | jeneps | Epstein, Jennifer | Bloomberg News | F | 61242 | 443.00 | 189.00 |
705706292 | rebeccaballhaus | Ballhaus, Rebecca | Wall Street Journal / Dow Jones | F | 24638 | 409.00 | 154.00 |
19734832 | sarahkliff | Kliff, Sarah L. | Vox Media | F | 100090 | 392.00 | 136.00 |
163995093 | AlexNBCNews | Moe, Alexandra | NBC News | F | 21689 | 388.00 | 134.00 |
237477771 | juliehdavis | Davis, Julie | New York Times | F | 49821 | 375.00 | 194.00 |
16149614 | jrovner | Rovner, Julie | Kaiser Health News | F | 21844 | 351.00 | 137.00 |
116341480 | RosieGray | Gray, Rosie | The Atlantic | F | 96935 | 345.00 | 125.00 |
28181835 | jpaceDC | Pace, Julie | Associated Press | F | 46017 | 328.00 | 132.00 |
52392666 | ZoeTillman | Tillman, Zoe | BuzzFeed | F | 15246 | 312.00 | 70.00 |
906734342 | KimberlyRobinsn | Robinson, Kimberly S. | Bloomberg BNA | F | 7170 | 308.00 | 38.00 |
188857501 | alexis_levinson | Levinson, Alexis R. | BuzzFeed | F | 25375 | 288.00 | 111.00 |
56552341 | LACaldwellDC | Caldwell, Leigh Ann | NBC News | F | 8464 | 282.00 | 98.00 |
151444950 | DaviSusan | Davis, Susan | National Public Radio | F | 27297 | 270.00 | 150.00 |
360080772 | FoxReports | Fox, Lauren | CNN | F | 7282 | 269.00 | 116.00 |
313545488 | LauraLitvan | Litvan, Laura | Bloomberg News | F | 4468 | 269.00 | 115.00 |
# Descriptive statistics of retweet_count for the rows whose gender is 'F'.
female_journalists_retweet_summary_df[['retweet_count']].describe()
retweet_count | |
---|---|
count | 993.00 |
mean | 36.67 |
std | 97.34 |
min | 0.00 |
25% | 0.00 |
50% | 5.00 |
75% | 25.00 |
max | 1,393.00 |
# Keep only the rows of the journalist retweet summary whose gender is 'M'.
male_journalists_retweet_summary_df = journalists_retweet_summary_df[journalists_retweet_summary_df.gender == 'M']
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
male_journalists_retweet_summary_df.to_csv('output/male_journalists_retweeted_by_journalists.csv')
male_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 1,836.00 | 289.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 1,723.00 | 387.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,577.00 | 451.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,459.00 | 397.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 1,403.00 | 280.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 1,359.00 | 341.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 1,343.00 | 341.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 1,324.00 | 286.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 1,221.00 | 306.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 1,207.00 | 334.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 1,186.00 | 296.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 1,177.00 | 297.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 1,120.00 | 314.00 |
16006592 | BenjySarlin | Sarlin, Benjamin | NBC News | M | 78075 | 1,039.00 | 215.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 1,011.00 | 277.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 943.00 | 281.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 916.00 | 247.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 829.00 | 315.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 770.00 | 193.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 705.00 | 155.00 |
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 704.00 | 225.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 699.00 | 223.00 |
22129280 | jimsciutto | Sciutto, James | CNN | M | 172012 | 688.00 | 242.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 654.00 | 284.00 |
15463671 | samstein | Stein, Sam | Huffington Post | M | 313211 | 642.00 | 229.00 |
# Descriptive statistics of retweet_count for the rows whose gender is 'M'.
male_journalists_retweet_summary_df[['retweet_count']].describe()
retweet_count | |
---|---|
count | 1,299.00 |
mean | 62.07 |
std | 178.04 |
min | 0.00 |
25% | 1.00 |
50% | 8.00 |
75% | 39.50 |
max | 1,836.00 |
# Rebuild the retweet summary from only the retweets made by female journalists
# (gender is the retweeting user's gender; retweet_gender is the retweeted user's).
journalists_retweeted_by_female_summary_df = journalist_retweet_summary(journalists_retweet_df[journalists_retweet_df.gender == 'F'])
# The CSV receives every row and column; the notebook displays only the first 25
# rows of the selected summary fields.
journalists_retweeted_by_female_summary_df.to_csv('output/journalists_retweeted_by_female_journalists.csv')
journalists_retweeted_by_female_summary_df[journalist_retweet_summary_fields].head(25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 748.00 | 122.00 |
593813785 | DonnaYoungDC | Young, Donna | S&P Global Market Intelligence | F | 5894 | 704.00 | 9.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 572.00 | 142.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 549.00 | 140.00 |
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 516.00 | 149.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 503.00 | 97.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 470.00 | 140.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 463.00 | 165.00 |
33653195 | ericawerner | Werner, Erica | Associated Press | F | 14049 | 452.00 | 119.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 447.00 | 116.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 403.00 | 132.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 388.00 | 158.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 372.00 | 129.00 |
118130765 | dylanlscott | Scott, Dylan L. | Stat News | M | 20122 | 367.00 | 67.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 365.00 | 122.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 344.00 | 164.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 338.00 | 103.00 |
167024520 | rachaelmbade | Bade, Rachel M. | Politico | F | 30164 | 303.00 | 59.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 302.00 | 106.00 |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | M | 83316 | 287.00 | 61.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 279.00 | 111.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 265.00 | 119.00 |
139738464 | mj_lee | Lee, MJ | CNN | F | 31940 | 259.00 | 79.00 |
217550862 | BresPolitico | Bresnahan, John | Politico | M | 40562 | 256.00 | 82.00 |
61734492 | Fahrenthold | Fahrenthold, David | Washington Post | M | 451778 | 253.00 | 115.00 |
The average is computed over retweets made by female journalists only: how many of those retweets each male / female journalist receives on average.
# Retweets made by female journalists, broken down by the gender of the retweeted journalist.
journalist_retweet_gender_summary(journalists_retweet_df[journalists_retweet_df.gender == 'F'])
count | percentage | avg_retweets | |
---|---|---|---|
index | |||
M | 25410 | 59.6% | 19.56 |
F | 17228 | 40.4% | 17.35 |
That is, retweets per female journalist.
# Retweets made by female journalists (gender is the retweeting user's gender).
female_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'F']
# Count each retweeter's retweets per retweeted gender (one column per gender after
# unstack), then left-merge onto all female rows of user_summary_df so journalists
# with no retweets still get a row. Only the F and M count columns are kept.
female_journalists_retweet_by_gender_df = pd.merge(user_summary_df[user_summary_df.gender == 'F'], female_journalists_retweet_df.groupby(['user_id', 'retweet_gender']).size().unstack(), how='left', left_index=True, right_index=True)[['F', 'M']]
# A missing count means no retweets of that gender, so treat it as zero.
female_journalists_retweet_by_gender_df.fillna(0, inplace=True)
# Total retweets of journalists made by each female journalist.
female_journalists_retweet_by_gender_df['all'] = female_journalists_retweet_by_gender_df.F + female_journalists_retweet_by_gender_df.M
female_journalists_retweet_by_gender_df.describe()
F | M | all | |
---|---|---|---|
count | 993.00 | 993.00 | 993.00 |
mean | 17.35 | 25.59 | 42.94 |
std | 45.34 | 74.55 | 113.79 |
min | 0.00 | 0.00 | 0.00 |
25% | 0.00 | 1.00 | 2.00 |
50% | 4.00 | 6.00 | 10.00 |
75% | 16.00 | 22.00 | 39.00 |
max | 857.00 | 1,779.00 | 2,385.00 |
# Build the retweet summary from only the rows whose gender is 'M', write the
# full summary to CSV, then display the first 25 rows of the selected fields.
journalists_retweeted_by_male_summary_df = journalist_retweet_summary(
    journalists_retweet_df.loc[journalists_retweet_df['gender'] == 'M']
)
journalists_retweeted_by_male_summary_df.to_csv(
    'output/journalists_retweeted_by_male_journalists.csv'
)
journalists_retweeted_by_male_summary_df[journalist_retweet_summary_fields].head(n=25)
screen_name | name | organization | gender | followers_count | retweet_count | retweeting_count | |
---|---|---|---|---|---|---|---|
user_id | |||||||
21316253 | ZekeJMiller | Miller, Zeke J. | Time Magazine | M | 198517 | 1,207.00 | 238.00 |
19107878 | GlennThrush | Thrush, Glenn H. | New York Times | M | 308181 | 1,114.00 | 286.00 |
407013776 | burgessev | Everett, John B. | Politico | M | 31010 | 1,088.00 | 167.00 |
14529929 | jaketapper | Tapper, Jake | CNN | M | 1305680 | 1,071.00 | 239.00 |
13524182 | daveweigel | Weigel, David | Washington Post | M | 332344 | 975.00 | 209.00 |
39155029 | mkraju | Raju, Manu K. | CNN | M | 88366 | 956.00 | 209.00 |
46557945 | StevenTDennis | Dennis, Steven T. | Bloomberg News | M | 55762 | 900.00 | 183.00 |
398088661 | MEPFuller | Fuller, Matt E. | Huffington Post | M | 77919 | 877.00 | 170.00 |
19847765 | sahilkapur | Kapur, Sahil | Bloomberg News | M | 69086 | 848.00 | 193.00 |
16006592 | BenjySarlin | Sarlin, Benjamin | NBC News | M | 78075 | 828.00 | 141.00 |
19186003 | seungminkim | Kim, Seung Min | Politico | F | 33980 | 821.00 | 185.00 |
16187637 | ChadPergram | Pergram, Chad | Fox News | M | 59305 | 812.00 | 175.00 |
31127446 | markknoller | Knoller, Mark | CBS News | M | 301474 | 794.00 | 201.00 |
259395895 | JohnJHarwood | Harwood, John | CNBC | M | 149040 | 777.00 | 196.00 |
104914594 | Phil_Mattingly | Mattingly, Phil | CNN | M | 40119 | 748.00 | 185.00 |
14007532 | frankthorp | Thorp, Frank | NBC News | M | 39798 | 737.00 | 194.00 |
18678924 | jmartNYT | Martin, Jonathan | New York Times | M | 197322 | 726.00 | 167.00 |
21252618 | JakeSherman | Sherman, Jacob S. | Politico | M | 81762 | 641.00 | 175.00 |
104299137 | DavidMDrucker | Drucker, David | Washington Examiner | M | 35033 | 583.00 | 127.00 |
70511174 | Hadas_Gold | Gold, Hadas | Politico | F | 45221 | 570.00 | 195.00 |
12354832 | kasie | Hunt, Kasie | NBC News | F | 187357 | 565.00 | 224.00 |
22771961 | Acosta | Acosta, Jim | CNN | M | 350650 | 564.00 | 196.00 |
19580890 | LeeCamp | Camp, Lee | RTTV America | M | 67601 | 560.00 | 6.00 |
3817401 | ericgeller | Geller, Eric | Politico | M | 58173 | 524.00 | 149.00 |
22129280 | jimsciutto | Sciutto, James | CNN | M | 172012 | 507.00 | 151.00 |
For retweets of other journalists made by male journalists, avg_retweets is the average number of retweets that each male / female journalist receives.
# Pass only the rows whose gender is 'M' to journalist_retweet_gender_summary.
# As the last expression in the cell, the returned summary is displayed.
journalist_retweet_gender_summary(
    journalists_retweet_df.loc[journalists_retweet_df['gender'] == 'M']
)
count | percentage | avg_retweets | |
---|---|---|---|
index | |||
M | 55224 | 74.2% | 42.51 |
F | 19186 | 25.8% | 19.32 |
That is, the number of retweets made per male journalist, split by the gender of the journalist retweeted.
# Rows of journalists_retweet_df whose gender is 'M'.
male_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'M']
# Count rows per (user_id, retweet_gender) and pivot retweet_gender into columns,
# then left-merge on the index onto the male rows of user_summary_df so that
# users without any counted retweets are kept (their counts are filled with 0).
#
# reindex(columns=...) is used instead of [['F', 'M']] so that both columns exist
# even if one gender never appears in retweet_gender (the plain selection would
# raise KeyError). fillna(0) returns a new frame rather than mutating a column
# slice in place, so adding the 'all' column below does not trigger
# SettingWithCopyWarning.
male_journalists_retweet_by_gender_df = (
    pd.merge(
        user_summary_df[user_summary_df.gender == 'M'],
        male_journalists_retweet_df.groupby(['user_id', 'retweet_gender']).size().unstack(),
        how='left',
        left_index=True,
        right_index=True,
    )
    .reindex(columns=['F', 'M'])
    .fillna(0)
)
# Total per user across both retweet_gender columns.
male_journalists_retweet_by_gender_df['all'] = male_journalists_retweet_by_gender_df.F + male_journalists_retweet_by_gender_df.M
# Last expression in the cell: display summary statistics for F, M and all.
male_journalists_retweet_by_gender_df.describe()
F | M | all | |
---|---|---|---|
count | 1,299.00 | 1,299.00 | 1,299.00 |
mean | 14.77 | 42.51 | 57.28 |
std | 33.50 | 106.87 | 136.92 |
min | 0.00 | 0.00 | 0.00 |
25% | 0.00 | 1.00 | 1.00 |
50% | 3.00 | 7.00 | 11.00 |
75% | 14.00 | 35.00 | 50.00 |
max | 442.00 | 1,414.00 | 1,766.00 |