Beltway reporters

Setup

This creates some functions used to load the data.

In [1]:
import pandas as pd
import numpy as np
import json
from dateutil.parser import parse as date_parse
import gzip
import logging

# Configure the root logger to emit DEBUG messages; tweet_iter below reports
# its loading progress at that level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Filepaths of the files to load.
filepaths = ['d59d27e2f2ed4778881573df2ecf2fad_001.json.gz',
            '25319652321b4bb498b250ffc53aa0f0_001.json.gz']

# Load tweets from gzipped, line-oriented JSON files, possibly transforming with provided function
# and limiting by number of tweets.
# Returns an iterator.
def tweet_iter(filepaths, limit=None, tweet_transform_func=None):
    """Iterate over tweets stored in gzipped, line-oriented JSON files.

    Args:
        filepaths: iterable of paths to gzipped files holding one JSON
            document per line.
        limit: maximum number of lines (tweets) to read in total, across all
            files. None (the default) reads everything.
        tweet_transform_func: optional callable applied to every parsed tweet.
            If it returns a list, each item is yielded separately; if it
            returns None, nothing is yielded for that tweet; any other return
            value is yielded as-is.

    Yields:
        The parsed tweets, or the transformed values when
        tweet_transform_func is given.
    """
    # Lines read across *all* files. The limit is checked against this total;
    # a per-file counter would let every file contribute up to `limit` lines.
    lines_read = 0
    for filepath in filepaths:
        with gzip.open(filepath) as file:
            # `count` is per-file and only drives the progress messages.
            for count, line in enumerate(file):
                if count % 50000 == 0:
                    logging.debug('Loaded %s', count)
                tweet = json.loads(line)
                lines_read += 1
                if tweet_transform_func:
                    transformed = tweet_transform_func(tweet)
                    if isinstance(transformed, list):
                        yield from transformed
                    elif transformed is not None:
                        yield transformed
                else:
                    yield tweet
                # Equality (not >=) keeps limit=None meaning "no limit".
                if lines_read == limit:
                    return

Find first tweet for each user

The goal is to determine a start date for limiting the dataset.

Load the data and count.

In [2]:
# Simplify the tweet on load
def tweet_type(tweet):
    """Classify a tweet as 'reply', 'retweet', 'quote' or 'original'.

    The checks run in that order, so a tweet with a truthy
    'in_reply_to_status_id' is a reply even if it also carries a
    'retweeted_status' or 'quoted_status' key.
    """
    if tweet.get('in_reply_to_status_id'):
        kind = 'reply'
    elif 'retweeted_status' in tweet:
        kind = 'retweet'
    elif 'quoted_status' in tweet:
        kind = 'quote'
    else:
        kind = 'original'
    return kind

def tweet_transform(tweet):
    """Reduce a raw tweet dict to the flat record used for the tweet table.

    Returns a dict with the tweet id and parsed creation time, the author's
    id, screen name, parsed account creation time and status count, and the
    tweet's type as reported by tweet_type().
    """
    record = {}
    record['tweet_id'] = tweet['id_str']
    record['tweet_created_at'] = date_parse(tweet['created_at'])
    author = tweet['user']
    record['user_id'] = author['id_str']
    record['screen_name'] = author['screen_name']
    record['user_created_at'] = date_parse(author['created_at'])
    record['tweets_to_date'] = author['statuses_count']
    record['tweet_type'] = tweet_type(tweet)
    return record

tweet_df = pd.DataFrame(tweet_iter(filepaths, tweet_transform_func=tweet_transform), columns=['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'user_created_at', 'tweets_to_date', 'tweet_type'])
tweet_df.count()
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 1550000
DEBUG:root:Loaded 1600000
DEBUG:root:Loaded 1650000
DEBUG:root:Loaded 1700000
DEBUG:root:Loaded 1750000
DEBUG:root:Loaded 1800000
Out[2]:
tweet_id            3364440
user_id             3364440
screen_name         3364440
tweet_created_at    3364440
user_created_at     3364440
tweets_to_date      3364440
tweet_type          3364440
dtype: int64

View the top of the data.

In [3]:
tweet_df.head()
Out[3]:
tweet_id user_id screen_name tweet_created_at user_created_at tweets_to_date tweet_type
0 847821180832804864 1638925448 A_Childers_ 2017-03-31 14:41:35+00:00 2013-08-01 21:44:28+00:00 6071 retweet
1 847814632643473411 1638925448 A_Childers_ 2017-03-31 14:15:34+00:00 2013-08-01 21:44:28+00:00 6071 retweet
2 847627543142219776 1638925448 A_Childers_ 2017-03-31 01:52:09+00:00 2013-08-01 21:44:28+00:00 6071 reply
3 847597404719267841 1638925448 A_Childers_ 2017-03-30 23:52:23+00:00 2013-08-01 21:44:28+00:00 6071 reply
4 847593734896324608 1638925448 A_Childers_ 2017-03-30 23:37:48+00:00 2013-08-01 21:44:28+00:00 6071 reply

Remove duplicates

Duplicates happen when collecting data from the Twitter API.

In [4]:
len(tweet_df['tweet_id'].unique())
Out[4]:
3335489
In [5]:
dedupe_tweet_df = tweet_df.drop_duplicates(['tweet_id'], keep='last')
len(dedupe_tweet_df)
Out[5]:
3335489

Number of tweets in dataset for each user

In [6]:
# Count the deduplicated tweets per user.
# The Series is named before it becomes a frame, instead of renaming a
# 'user_id' column afterwards: newer pandas names the value_counts() result
# 'count', so the column rename would silently do nothing there.
tweet_count_df = dedupe_tweet_df['user_id'].value_counts().rename('tweets_in_dataset').to_frame()
tweet_count_df.index.name = 'user_id'
tweet_count_df.count()
Out[6]:
tweets_in_dataset    1443
dtype: int64
In [7]:
tweet_count_df.head()
Out[7]:
tweets_in_dataset
user_id
3817401 5286
22891564 4321
456994513 4273
593813785 4110
15146659 3945
In [8]:
# Get the first tweet for each user
first_tweet_df = dedupe_tweet_df.loc[dedupe_tweet_df.groupby('user_id')['tweet_created_at'].idxmin()].set_index(['user_id'])
first_tweet_df.count()
Out[8]:
tweet_id            1443
screen_name         1443
tweet_created_at    1443
user_created_at     1443
tweets_to_date      1443
tweet_type          1443
dtype: int64
In [9]:
first_tweet_df.head()
Out[9]:
tweet_id screen_name tweet_created_at user_created_at tweets_to_date tweet_type
user_id
100165378 619906732052074496 ChristineSisto 2015-07-11 16:30:56+00:00 2009-12-29 07:27:27+00:00 8646 retweet
1001991865 289090058148012033 FredSchulte 2013-01-09 19:23:35+00:00 2012-12-10 16:16:10+00:00 888 reply
1002229862 425802092465623040 HMRothmandc 2014-01-22 01:28:24+00:00 2012-12-10 18:37:13+00:00 1777 reply
100270054 740945974143635464 Laubarth 2016-06-09 16:37:41+00:00 2009-12-29 17:02:01+00:00 6 original
100802089 7240989598 ayesharascoe 2009-12-31 17:27:25+00:00 2009-12-31 16:48:11+00:00 491 original
In [10]:
# Merge with number of tweets in dataset for each user
first_tweet_merge_df = first_tweet_df.join(tweet_count_df).drop(['tweet_id', 'tweet_type'], axis=1)
first_tweet_merge_df.count()
Out[10]:
screen_name          1443
tweet_created_at     1443
user_created_at      1443
tweets_to_date       1443
tweets_in_dataset    1443
dtype: int64

First tweet for each user <----------

For each user, the date of the first tweet in the dataset, the date the account was created, the number of tweets to date (roughly), and the tweets in the dataset.

If the user_created_at and tweet_created_at are close, then this is probably a new account. If the user_created_at and tweet_created_at are not close but there is a small number of tweets, then this user probably started tweeting recently (like a new account). If the user_created_at and tweet_created_at are not close and there is a large number of tweets, then this is probably a prolific tweeter. Note that for such a user, probably not all tweets were collected.

In [11]:
first_tweet_merge_df.sort_values('tweet_created_at', ascending=False).head(20)
Out[11]:
screen_name tweet_created_at user_created_at tweets_to_date tweets_in_dataset
user_id
76696176 sklee_ca 2017-03-31 17:07:58+00:00 2009-09-23 17:09:53+00:00 2 2
66768858 emmaroller 2017-03-27 13:25:07+00:00 2009-08-18 19:10:55+00:00 223 210
842787331224584192 RebeccaEHoffman 2017-03-17 17:27:58+00:00 2017-03-17 17:18:52+00:00 9 9
831972200014045191 ErinMcManus15 2017-03-15 14:16:31+00:00 2017-02-15 21:03:24+00:00 1 1
20281013 EvanMcS 2017-03-10 16:33:43+00:00 2009-02-06 23:09:59+00:00 146 196
18825339 CahnEmily 2017-03-10 13:43:41+00:00 2009-01-10 03:19:50+00:00 86548 3205
30176025 LaurenFCarroll 2017-03-08 20:33:21+00:00 2009-04-10 06:29:32+00:00 34 49
3817401 ericgeller 2017-03-07 16:38:59+00:00 2007-04-08 20:27:11+00:00 186181 5286
21612122 HotlineJosh 2017-03-03 22:00:52+00:00 2009-02-22 23:45:46+00:00 143393 3227
22891564 chrisgeidner 2017-03-02 16:35:58+00:00 2009-03-05 06:48:00+00:00 193071 4321
836643030161625089 BeddingfieldMJ 2017-02-28 19:00:31+00:00 2017-02-28 18:23:37+00:00 5 5
103016675 AaronMehta 2017-02-16 19:05:49+00:00 2010-01-08 15:53:25+00:00 37425 3242
275207082 AlexParkerDC 2017-02-15 14:48:35+00:00 2011-03-31 20:53:10+00:00 135230 3206
456994513 maria_e_recio 2017-02-12 15:54:14+00:00 2012-01-06 22:22:40+00:00 31774 4273
13524182 daveweigel 2017-02-09 21:56:58+00:00 2008-02-15 17:58:23+00:00 160812 3846
827605131109793792 OSHAReporter 2017-02-08 21:50:54+00:00 2017-02-03 19:50:14+00:00 99 114
21810329 sdonnan 2017-02-08 13:44:00+00:00 2009-02-24 23:10:17+00:00 69845 3780
21696279 brianbeutler 2017-02-03 20:46:05+00:00 2009-02-23 21:31:16+00:00 90303 3765
15146659 JSwiftTWS 2017-02-03 01:33:02+00:00 2008-06-17 15:19:03+00:00 77924 3945
16459325 ryanbeckwith 2017-02-02 15:57:24+00:00 2008-09-25 22:43:36+00:00 83228 3223

Types of tweets <----------

In [12]:
dedupe_tweet_df['tweet_type'].value_counts()
Out[12]:
original    1593541
retweet     1094028
reply        396287
quote        251633
Name: tweet_type, dtype: int64

Top mentions

Determine who is being mentioned and attempt to characterize. Retweets and quotes are omitted.

In [13]:
# Simplify the tweet on load
def mention_transform(tweet):
    """Return one record per user mention in an original tweet or reply.

    Tweets carrying a 'retweeted_status' or 'quoted_status' key yield an
    empty list, as do tweets without any user mentions. Each record holds the
    mentioning tweet's id, author id and screen name, the mentioned account's
    id and screen name, and the parsed creation time of the tweet.
    """
    # Retweets and quotes are skipped entirely.
    if 'retweeted_status' in tweet or 'quoted_status' in tweet:
        return []
    user_mentions = tweet.get('entities', {}).get('user_mentions', [])
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': mentioned['id_str'],
            'mention_screen_name': mentioned['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mentioned in user_mentions
    ]

mention_df = pd.DataFrame(tweet_iter(filepaths, tweet_transform_func=mention_transform))
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 1550000
DEBUG:root:Loaded 1600000
DEBUG:root:Loaded 1650000
DEBUG:root:Loaded 1700000
DEBUG:root:Loaded 1750000
DEBUG:root:Loaded 1800000

Number of mentions found in the dataset

In [14]:
mention_df.count()
Out[14]:
mention_screen_name    1363129
mention_user_id        1363129
screen_name            1363129
tweet_created_at       1363129
tweet_id               1363129
user_id                1363129
dtype: int64

The mention data

Each mention consists of the tweet id and creation time, the screen name and user id of the account that is mentioned, and the screen_name and user_id of the account doing the mentioning.

In [15]:
mention_df.head()
Out[15]:
mention_screen_name mention_user_id screen_name tweet_created_at tweet_id user_id
0 davidbschultz 53739928 A_Childers_ 2017-03-31 01:52:09+00:00 847627543142219776 1638925448
1 davidbschultz 53739928 A_Childers_ 2017-03-30 23:52:23+00:00 847597404719267841 1638925448
2 AriPeskoe 499013898 A_Childers_ 2017-03-30 23:37:48+00:00 847593734896324608 1638925448
3 deantscott 134918286 A_Childers_ 2017-03-30 23:37:48+00:00 847593734896324608 1638925448
4 Pat_Ambrosio 2497185313 A_Childers_ 2017-03-30 19:41:27+00:00 847534254355599364 1638925448

Remove duplicates

In [16]:
dedupe_mention_df = mention_df.drop_duplicates()
dedupe_mention_df.count()
Out[16]:
mention_screen_name    1348153
mention_user_id        1348153
screen_name            1348153
tweet_created_at       1348153
tweet_id               1348153
user_id                1348153
dtype: int64
In [17]:
# From the mentions, extract map of user ids to screen names.
# For every mentioned user id, take the screen name from the most recent mention.
# Uses the deduplicated mentions (consistent with the retweet lookup) and .loc
# for the column selection, since the .ix indexer no longer exists in current pandas.
latest_mention_idx = dedupe_mention_df.groupby('mention_user_id')['tweet_created_at'].idxmax()
user_id_lookup_df = dedupe_mention_df.loc[latest_mention_idx, ['mention_user_id', 'mention_screen_name']].set_index(['mention_user_id'])
user_id_lookup_df.count()
Out[17]:
mention_screen_name    137344
dtype: int64
In [18]:
user_id_lookup_df.head()
Out[18]:
mention_screen_name
mention_user_id
1000010898 RoyScranton
100002112 whyyradiotimes
1000030188 jessieb747
100003141 NCCDtweets
100004577 Orange_France
In [19]:
# Group by user_id
# This count should match the user_id map count
mention_summary_user_id_df = pd.DataFrame(dedupe_mention_df.groupby('mention_user_id').size(), columns=['mention_count'])
mention_summary_user_id_df.count()
Out[19]:
mention_count    137344
dtype: int64
In [20]:
mention_summary_user_id_df.head()
Out[20]:
mention_count
mention_user_id
1000010898 20
100002112 15
1000030188 10
100003141 2
100004577 1
In [21]:
# Join with user id map
mention_summary_df = mention_summary_user_id_df.join(user_id_lookup_df)
mention_summary_df.count()
Out[21]:
mention_count          137344
mention_screen_name    137344
dtype: int64
In [22]:
mention_summary_df.head()
Out[22]:
mention_count mention_screen_name
mention_user_id
1000010898 20 RoyScranton
100002112 15 whyyradiotimes
1000030188 10 jessieb747
100003141 2 NCCDtweets
100004577 1 Orange_France

Load known Twitter accounts

In [23]:
def seed_iter(filepath):
    """Yield {'screen_name', 'user_id'} dicts from a two-column CSV file.

    Each non-empty line must have the form ``screen_name,user_id``; a line
    with a different number of comma-separated fields raises ValueError.
    Empty lines are skipped.
    """
    with open(filepath) as file:
        for line in file:
            # Strip only the line terminator. Slicing off the last character
            # would truncate the user id when the final line of the file has
            # no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            screen_name, user_id = line.split(',')
            yield {'screen_name': screen_name, 'user_id': user_id}

def load_seed_df(filepath, seed_type):
    """Load a seed CSV into a DataFrame tagged with an account type.

    The frame holds the records produced by seed_iter(filepath), plus a
    'screen_name_lower' column (lower-cased screen name) and a 'type' column
    set to seed_type on every row.
    """
    seed_df = pd.DataFrame(seed_iter(filepath))
    lowered_names = seed_df.screen_name.map(str.lower)
    seed_df['screen_name_lower'] = lowered_names
    seed_df['type'] = seed_type
    return seed_df
In [24]:
federal_agencies_df = load_seed_df('federal_agencies.csv', 'government')
federal_agencies_df.count()
Out[24]:
screen_name          2968
user_id              2968
screen_name_lower    2968
type                 2968
dtype: int64
In [25]:
news_outlets_df = load_seed_df('news_outlets.csv', 'media')
news_outlets_df.count()
Out[25]:
screen_name          92
user_id              92
screen_name_lower    92
type                 92
dtype: int64
In [26]:
newspaper_reporters_df = load_seed_df('newspaper_reporters.csv', 'reporters')
newspaper_reporters_df.count()
Out[26]:
screen_name          790
user_id              790
screen_name_lower    790
type                 790
dtype: int64
In [27]:
periodical_reporters_df = load_seed_df('periodical_reporters.csv', 'reporters')
periodical_reporters_df.count()
Out[27]:
screen_name          677
user_id              677
screen_name_lower    677
type                 677
dtype: int64
In [28]:
administration_officials_df = load_seed_df('administration_officials.csv', 'politicians')
administration_officials_df.count()
Out[28]:
screen_name          63
user_id              63
screen_name_lower    63
type                 63
dtype: int64
In [29]:
cabinet_df = load_seed_df('cabinet.csv', 'politicians')
cabinet_df.count()
Out[29]:
screen_name          12
user_id              12
screen_name_lower    12
type                 12
dtype: int64
In [30]:
representatives_df = load_seed_df('representatives.csv', 'politicians')
representatives_df.count()
Out[30]:
screen_name          431
user_id              431
screen_name_lower    431
type                 431
dtype: int64
In [31]:
senators_df = load_seed_df('senators.csv', 'politicians')
senators_df.count()
Out[31]:
screen_name          100
user_id              100
screen_name_lower    100
type                 100
dtype: int64
In [32]:
media_df = load_seed_df('media.csv', 'media')
media_df.count()
Out[32]:
screen_name          5997
user_id              5997
screen_name_lower    5997
type                 5997
dtype: int64
In [33]:
# Order is deliberate here, since will be deduplicating: drop_duplicates keeps
# the first occurrence of each lower-cased screen name, so frames listed
# earlier take precedence when an account appears in several seed lists.
# pd.concat is used because DataFrame.append is not available in current pandas.
screen_name_lookup_df = pd.concat([newspaper_reporters_df,
                                   administration_officials_df,
                                   news_outlets_df,
                                   periodical_reporters_df,
                                   cabinet_df,
                                   representatives_df,
                                   senators_df,
                                   media_df,
                                   federal_agencies_df], ignore_index=True).drop_duplicates(subset='screen_name_lower').set_index(['user_id'])
screen_name_lookup_df.count()
Out[33]:
screen_name          10932
screen_name_lower    10932
type                 10932
dtype: int64
In [34]:
screen_name_lookup_df.head()
Out[34]:
screen_name screen_name_lower type
user_id
2345626885 marcy_crane marcy_crane reporters
780221130 loren_duggan loren_duggan reporters
285772181 akesslerdc akesslerdc reporters
29607664 adamliptak adamliptak reporters
9484732 amacker amacker reporters

Join the mentions and known Twitter accounts

In [35]:
# Left join keeps every mentioned account; accounts missing from the known
# list get NaN in the lookup columns.
mention_join_df = mention_summary_df.join(screen_name_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column, which is not guaranteed to modify the parent frame.
mention_join_df['type'] = mention_join_df['type'].fillna('unknown')
mention_join_df.index.name = 'user_id'
mention_join_df.head()
Out[35]:
mention_count mention_screen_name screen_name screen_name_lower type
user_id
1000010898 20 RoyScranton NaN NaN unknown
100002112 15 whyyradiotimes NaN NaN unknown
1000030188 10 jessieb747 NaN NaN unknown
100003141 2 NCCDtweets NaN NaN unknown
100004577 1 Orange_France NaN NaN unknown

Top (by mention count) accounts that are matched against known Twitter accounts <----------

In [36]:
top_known_mentions_df = mention_join_df[pd.notnull(mention_join_df.screen_name)].sort_values('mention_count', ascending=False)
top_known_mentions_df[['mention_screen_name', 'mention_count', 'type']].head(20)
Out[36]:
mention_screen_name mention_count type
user_id
25073877 realDonaldTrump 19057 politicians
51241574 AP 15077 media
3108351 WSJ 12550 media
15754281 USATODAY 11999 media
822215679726100480 POTUS 9872 politicians
1652541 Reuters 9158 media
15922214 rollcall 7175 media
9300262 politico 7113 media
807095 nytimes 6335 media
818927131883356161 PressSec 5849 politicians
14662354 WashTimes 5767 media
2467791 washingtonpost 5644 media
18916432 SpeakerRyan 3130 politicians
14615871 EPA 2863 government
95431448 BostonGlobe 2649 media
14857525 csmonitor 2279 media
103016675 AaronMehta 1964 reporters
398088661 MEPFuller 1889 reporters
759251 CNN 1885 media
818910970567344128 VP 1884 politicians

Number of matched accounts <----------

mention_screen_name is the number of unique mentioned accounts. screen_name is the number of matched unique accounts.

In [37]:
mention_join_df.count()
Out[37]:
mention_count          137344
mention_screen_name    137344
screen_name              3721
screen_name_lower        3721
type                   137344
dtype: int64

Top accounts by mentions <----------

Unknown for type indicates that it is not matched with a known Twitter account.

In [38]:
top_mentions_df = mention_join_df.sort_values('mention_count', ascending=False)
top_mentions_df[['mention_screen_name', 'mention_count', 'type']].head(50)
Out[38]:
mention_screen_name mention_count type
user_id
25073877 realDonaldTrump 19057 politicians
51241574 AP 15077 media
3108351 WSJ 12550 media
15754281 USATODAY 11999 media
2312829909 CQnow 11286 unknown
1339835893 HillaryClinton 10526 unknown
822215679726100480 POTUS 9872 politicians
1652541 Reuters 9158 media
34713362 business 7976 unknown
15147042 educationweek 7514 unknown
15922214 rollcall 7175 media
9300262 politico 7113 media
459277523 BloombergBNA 6710 unknown
807095 nytimes 6335 media
18956073 dcexaminer 6253 unknown
818927131883356161 PressSec 5849 politicians
14662354 WashTimes 5767 media
2467791 washingtonpost 5644 media
564111558 bpolitics 3614 unknown
216776631 BernieSanders 3313 unknown
185817496 FERNnews 3197 unknown
18916432 SpeakerRyan 3130 politicians
14615871 EPA 2863 government
23022687 tedcruz 2687 unknown
95431448 BostonGlobe 2649 media
27741349 RNS 2494 unknown
813286 BarackObama 2388 unknown
34613951 BloombergLaw 2376 unknown
17197344 Nextgov 2324 unknown
16311797 WSJPolitics 2296 unknown
14857525 csmonitor 2279 media
790293275630592002 EEPublishing 2153 unknown
19918986 WashBlade 2147 unknown
15745368 marcorubio 2054 unknown
18949452 FT 2040 unknown
14692385 Militarydotcom 2017 unknown
15164565 Slate 1987 unknown
103016675 AaronMehta 1964 reporters
398088661 MEPFuller 1889 reporters
759251 CNN 1885 media
818910970567344128 VP 1884 politicians
15210284 nationaljournal 1731 unknown
207660339 POLITICOPro 1639 unknown
19034656 Publici 1591 unknown
14597239 TonyRomm 1589 reporters
39308549 DailyCaller 1532 unknown
227682918 BV 1513 unknown
3817401 ericgeller 1505 reporters
140286364 nielslesniewski 1498 reporters
8953122 PolitiFact 1481 unknown

Mentions by account type <----------

In [39]:
# Sum only the numeric mention_count column; the frame also holds string
# columns (screen names) that must not be fed to sum().
mention_join_df.groupby('type')[['mention_count']].sum()
Out[39]:
mention_count
type
government 33127
media 121848
politicians 94800
reporters 174725
unknown 923653

Top (by mentions) accounts that are not known. <----------

These are the accounts that we will want to categorize.

In [40]:
top_not_known_mention_df = mention_join_df[mention_join_df.type == 'unknown'].sort_values('mention_count', ascending=False)
top_not_known_mention_df[['mention_screen_name', 'mention_count']].head(100)
Out[40]:
mention_screen_name mention_count
user_id
2312829909 CQnow 11286
1339835893 HillaryClinton 10526
34713362 business 7976
15147042 educationweek 7514
459277523 BloombergBNA 6710
18956073 dcexaminer 6253
564111558 bpolitics 3614
216776631 BernieSanders 3313
185817496 FERNnews 3197
23022687 tedcruz 2687
27741349 RNS 2494
813286 BarackObama 2388
34613951 BloombergLaw 2376
17197344 Nextgov 2324
16311797 WSJPolitics 2296
790293275630592002 EEPublishing 2153
19918986 WashBlade 2147
15745368 marcorubio 2054
18949452 FT 2040
14692385 Militarydotcom 2017
15164565 Slate 1987
15210284 nationaljournal 1731
207660339 POLITICOPro 1639
19034656 Publici 1591
39308549 DailyCaller 1532
227682918 BV 1513
8953122 PolitiFact 1481
63781564 newsfromIN 1452
16789970 SenatorReid 1430
93069110 maggieNYT 1417
... ... ...
1020058453 BuzzFeedNews 771
15723290 TIMEPolitics 767
14173315 NBCNews 759
149954544 MorningConsult 754
86129724 costareports 753
16334857 WSJecon 751
17066782 GovExec 741
1347285918 ChrisChristie 720
15862891 NatureNews 691
69620713 markets 691
135575282 morningmoneyben 686
16244449 jbarro 685
114756202 HouseCommerce 672
18772897 reidepstein 669
15438913 MailOnline 665
14677751 mateagold 663
16405372 PoliticsK12 663
16184358 CNNMoney 653
1433356862 DefenseOne 651
17487795 McClatchyDC 650
17243582 blakehounshell 640
10228272 YouTube 632
14268812 ron_fournier 617
370113964 sangerkatz 606
21315320 unsuckdcmetro 596
15893354 wpjenna 587
16012783 thedailybeast 584
51462013 lizzieohreally 581
50325797 chucktodd 571
1330457336 billclinton 571

100 rows × 2 columns

Top retweets and quotes

Determine who is being retweeted and quoted.

In [41]:
# Simplify the tweet on load
def retweet_transform(tweet):
    """Return a record describing whom a retweet or quote tweet is amplifying.

    The source tweet is the 'retweeted_status' if present and truthy,
    otherwise the 'quoted_status'. Returns None when the tweet has neither.
    The record holds the tweet's id, author id and screen name, the source
    tweet author's id and screen name, and the tweet's parsed creation time.
    """
    source = tweet.get('retweeted_status') or tweet.get('quoted_status')
    if not source:
        return None
    record = {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'retweet_user_id': source['user']['id_str'],
        'retweet_screen_name': source['user']['screen_name'],
        'tweet_created_at': date_parse(tweet['created_at'])
    }
    return record

retweet_df = pd.DataFrame(tweet_iter(filepaths, tweet_transform_func=retweet_transform))
retweet_df.count()
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 1550000
DEBUG:root:Loaded 1600000
DEBUG:root:Loaded 1650000
DEBUG:root:Loaded 1700000
DEBUG:root:Loaded 1750000
DEBUG:root:Loaded 1800000
Out[41]:
retweet_screen_name    1361711
retweet_user_id        1361711
screen_name            1361711
tweet_created_at       1361711
tweet_id               1361711
user_id                1361711
dtype: int64
In [42]:
retweet_df.head()
Out[42]:
retweet_screen_name retweet_user_id screen_name tweet_created_at tweet_id user_id
0 paulconndc 64502388 A_Childers_ 2017-03-31 14:41:35+00:00 847821180832804864 1638925448
1 azevin 14744078 A_Childers_ 2017-03-31 14:15:34+00:00 847814632643473411 1638925448
2 TiffanyStecker 17679229 A_Childers_ 2017-03-30 17:47:04+00:00 847505467995693057 1638925448
3 Calvinn_Hobbes 1579422614 A_Childers_ 2017-03-30 13:50:02+00:00 847445818072317952 1638925448
4 business 34713362 A_Childers_ 2017-03-30 13:17:17+00:00 847437576856330241 1638925448

Remove duplicates

In [43]:
dedupe_retweet_df = retweet_df.drop_duplicates()
dedupe_retweet_df.count()
Out[43]:
retweet_screen_name    1348290
retweet_user_id        1348290
screen_name            1348290
tweet_created_at       1348290
tweet_id               1348290
user_id                1348290
dtype: int64
In [44]:
# From the retweets, extract map of user ids to screen names.
# For every retweeted/quoted user id, take the screen name from the most recent tweet.
# .loc does the column selection, since the .ix indexer no longer exists in current pandas.
latest_retweet_idx = dedupe_retweet_df.groupby('retweet_user_id')['tweet_created_at'].idxmax()
retweet_user_id_lookup_df = dedupe_retweet_df.loc[latest_retweet_idx, ['retweet_user_id', 'retweet_screen_name']].set_index(['retweet_user_id'])
retweet_user_id_lookup_df.count()
Out[44]:
retweet_screen_name    108098
dtype: int64
In [45]:
retweet_user_id_lookup_df.head()
Out[45]:
retweet_screen_name
retweet_user_id
1000010898 RoyScranton
100002112 whyyradiotimes
100003141 NCCDtweets
100005598 hotelkeys
100007369 signixsolutions
In [46]:
# Group by user_id
# This count should match the retweet_user_id map count
retweet_summary_user_id_df = pd.DataFrame(dedupe_retweet_df.groupby('retweet_user_id').size(), columns=['retweet_count'])
retweet_summary_user_id_df.count()
Out[46]:
retweet_count    108098
dtype: int64
In [47]:
retweet_summary_user_id_df.head()
Out[47]:
retweet_count
retweet_user_id
1000010898 2
100002112 37
100003141 5
100005598 9
100007369 1
In [48]:
# Join with user id map
retweet_summary_df = retweet_summary_user_id_df.join(retweet_user_id_lookup_df)
retweet_summary_df.count()
Out[48]:
retweet_count          108098
retweet_screen_name    108098
dtype: int64
In [49]:
retweet_summary_df.head()
Out[49]:
retweet_count retweet_screen_name
retweet_user_id
1000010898 2 RoyScranton
100002112 37 whyyradiotimes
100003141 5 NCCDtweets
100005598 9 hotelkeys
100007369 1 signixsolutions

Join the retweets and known Twitter accounts

In [50]:
# Left join keeps every retweeted/quoted account; accounts missing from the
# known list get NaN in the lookup columns.
retweet_join_df = retweet_summary_df.join(screen_name_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column, which is not guaranteed to modify the parent frame.
retweet_join_df['type'] = retweet_join_df['type'].fillna('unknown')
retweet_join_df.index.name = 'user_id'
retweet_join_df.head()
Out[50]:
retweet_count retweet_screen_name screen_name screen_name_lower type
user_id
1000010898 2 RoyScranton NaN NaN unknown
100002112 37 whyyradiotimes NaN NaN unknown
100003141 5 NCCDtweets NaN NaN unknown
100005598 9 hotelkeys NaN NaN unknown
100007369 1 signixsolutions NaN NaN unknown
### Top (by retweet count) accounts that are matched against known Twitter accounts <----------
In [51]:
top_known_retweets_df = retweet_join_df[pd.notnull(retweet_join_df.screen_name)].sort_values('retweet_count', ascending=False)
top_known_retweets_df[['retweet_screen_name', 'retweet_count', 'type']].head(20)
Out[51]:
retweet_screen_name retweet_count type
user_id
51241574 AP 8998 media
426802833 AP_Politics 8436 media
25073877 realDonaldTrump 8320 politicians
1917731 thehill 6307 media
9300262 politico 6024 media
2467791 washingtonpost 5552 media
15922214 rollcall 4769 media
807095 nytimes 4062 media
21316253 ZekeJMiller 3860 reporters
13524182 daveweigel 3437 reporters
1652541 Reuters 3025 media
46557945 StevenTDennis 2950 reporters
19186003 seungminkim 2743 reporters
906734342 KimberlyRobinsn 2719 reporters
407013776 burgessev 2700 reporters
3108351 WSJ 2649 media
398088661 MEPFuller 2636 reporters
23232204 ShaneGoldmacher 2624 reporters
217550862 BresPolitico 2532 reporters
140286364 nielslesniewski 2486 reporters

Number of matched accounts <----------

retweet_screen_name is the number of unique retweeted or quoted accounts. screen_name is the number of matched unique accounts.

In [52]:
retweet_join_df.count()
Out[52]:
retweet_count          108098
retweet_screen_name    108098
screen_name              3542
screen_name_lower        3542
type                   108098
dtype: int64

Top accounts by retweets <----------

A type of "unknown" indicates that the account is not matched with a known Twitter account.

In [53]:
# Rank every retweeted account (matched or not) by how often it was retweeted.
top_retweets_df = retweet_join_df.sort_values(
    by='retweet_count',
    ascending=False,
)
top_retweets_df.loc[:, ['retweet_screen_name', 'retweet_count', 'type']].head(50)
Out[53]:
retweet_screen_name retweet_count type
user_id
51241574 AP 8998 media
426802833 AP_Politics 8436 media
25073877 realDonaldTrump 8320 politicians
90614279 EENewsUpdates 6548 unknown
2312829909 CQnow 6495 unknown
1917731 thehill 6307 media
9300262 politico 6024 media
93069110 maggieNYT 5751 unknown
2467791 washingtonpost 5552 media
15922214 rollcall 4769 media
34713362 business 4105 unknown
807095 nytimes 4062 media
21316253 ZekeJMiller 3860 reporters
13524182 daveweigel 3437 reporters
299802277 BraddJaffy 3056 unknown
1652541 Reuters 3025 media
46557945 StevenTDennis 2950 reporters
59331128 PhilipRucker 2843 unknown
14529929 jaketapper 2807 unknown
19186003 seungminkim 2743 reporters
207660339 POLITICOPro 2729 unknown
906734342 KimberlyRobinsn 2719 reporters
407013776 burgessev 2700 reporters
3108351 WSJ 2649 media
398088661 MEPFuller 2636 reporters
23232204 ShaneGoldmacher 2624 reporters
14412533 CillizzaCNN 2567 unknown
217550862 BresPolitico 2532 reporters
17243582 blakehounshell 2509 unknown
140286364 nielslesniewski 2486 reporters
48120914 SopanDeb 2470 unknown
21252618 JakeSherman 2462 reporters
19914257 mviser 2416 reporters
18678924 jmartNYT 2338 reporters
39155029 mkraju 2308 reporters
19847765 sahilkapur 2278 reporters
34613951 BloombergLaw 2268 unknown
15147042 educationweek 2206 unknown
15463671 samsteinhp 2196 reporters
149954544 MorningConsult 2154 unknown
16187637 ChadPergram 2135 unknown
15754281 USATODAY 2091 media
18956073 dcexaminer 2056 unknown
104914594 Phil_Mattingly 2051 unknown
31127446 markknoller 2045 unknown
380648579 AFP 2015 media
19107878 GlennThrush 2010 reporters
15433452 JenniferJJacobs 1979 unknown
86129724 costareports 1920 unknown
16930125 edatpost 1916 reporters

Retweets by account type <----------

In [54]:
# Total retweets per account type. Select the numeric count column explicitly:
# summing the whole frame also tries to aggregate the string screen-name
# columns, which newer pandas versions no longer silently drop.
retweet_join_df.groupby('type')[['retweet_count']].sum()
Out[54]:
retweet_count
type
government 10892
media 84886
politicians 24630
reporters 299580
unknown 928302

Top (by retweets) accounts that are not known. <----------

These are the accounts that we will want to categorize.

In [55]:
# Accounts whose type was filled in as 'unknown' (no match against the known
# accounts), ordered from most to least retweeted.
top_not_known_retweets_df = (
    retweet_join_df
    .loc[retweet_join_df['type'] == 'unknown']
    .sort_values(by='retweet_count', ascending=False)
)
top_not_known_retweets_df.loc[:, ['retweet_screen_name', 'retweet_count']].head(100)
Out[55]:
retweet_screen_name retweet_count
user_id
90614279 EENewsUpdates 6548
2312829909 CQnow 6495
93069110 maggieNYT 5751
34713362 business 4105
299802277 BraddJaffy 3056
59331128 PhilipRucker 2843
14529929 jaketapper 2807
207660339 POLITICOPro 2729
14412533 CillizzaCNN 2567
17243582 blakehounshell 2509
48120914 SopanDeb 2470
34613951 BloombergLaw 2268
15147042 educationweek 2206
149954544 MorningConsult 2154
16187637 ChadPergram 2135
18956073 dcexaminer 2056
104914594 Phil_Mattingly 2051
31127446 markknoller 2045
15433452 JenniferJJacobs 1979
86129724 costareports 1920
14515799 brianstelter 1862
326255267 KFILE 1792
259395895 JohnJHarwood 1792
16405372 PoliticsK12 1677
564111558 bpolitics 1597
32871086 kylegriffin1 1547
18772897 reidepstein 1532
16868756 ddiamond 1513
218325695 Bencjacobs 1483
16311797 WSJPolitics 1468
... ... ...
235196204 GlennKesslerWP 974
26133429 Alex_Roarty 973
18686907 davidfrum 970
26792275 ForeignPolicy 969
126722715 abbydphillip 966
22129280 jimsciutto 951
2800581040 BillKristol 948
8953122 PolitiFact 943
15675138 cspan 939
47455112 lrozen 925
21431618 mckaycoppins 925
14606079 ProPublica 915
22278303 charlesornstein 914
16017475 NateSilver538 912
950531 pbump 906
3066084185 APBusiness 898
14173315 NBCNews 878
38936142 jdawsey1 875
4207961 chrislhayes 874
370113964 sangerkatz 872
17509945 medpagetoday 865
463765807 Nate_Cohn 854
142721190 elisefoley 847
15210284 nationaljournal 837
22032260 PlattsOil 827
19034656 Publici 812
47448886 JeffreyGoldberg 807
2149973089 APDiploWriter 804
2347049341 voxdotcom 800
459277523 BloombergBNA 795

100 rows × 2 columns

Top replies

Determine who is being replied to.

In [56]:
# Simplify the tweet on load, keeping only replies
def reply_transform(tweet):
    """Reduce a reply tweet to the flat record used by the reply analysis.

    Returns None when the tweet's 'in_reply_to_status_id' is missing or falsy
    (the tweet is not treated as a reply). Otherwise returns a dict with the
    tweet id, the author's id and screen name, the id / screen name / tweet id
    being replied to, and the parsed creation timestamp.
    """
    # Guard clause: anything that is not a reply is dropped by the loader.
    if not tweet.get('in_reply_to_status_id'):
        return None

    reply_record = {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'reply_to_user_id': tweet['in_reply_to_user_id_str'],
        'reply_to_screen_name': tweet['in_reply_to_screen_name'],
        'reply_to_tweet_id': tweet['in_reply_to_status_id_str'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }
    return reply_record

# Stream every input file through reply_transform; non-replies come back as
# None and are skipped by tweet_iter, so the frame holds one row per reply.
reply_df = pd.DataFrame(
    tweet_iter(
        filepaths,
        tweet_transform_func=reply_transform,
    )
)
reply_df.count()
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 1550000
DEBUG:root:Loaded 1600000
DEBUG:root:Loaded 1650000
DEBUG:root:Loaded 1700000
DEBUG:root:Loaded 1750000
DEBUG:root:Loaded 1800000
Out[56]:
reply_to_screen_name    398593
reply_to_tweet_id       398593
reply_to_user_id        398593
screen_name             398593
tweet_created_at        398593
tweet_id                398593
user_id                 398593
dtype: int64
In [57]:
# Peek at the first few reply records to sanity-check the extracted columns.
reply_df.head()
Out[57]:
reply_to_screen_name reply_to_tweet_id reply_to_user_id screen_name tweet_created_at tweet_id user_id
0 davidbschultz 847622348777771008 53739928 A_Childers_ 2017-03-31 01:52:09+00:00 847627543142219776 1638925448
1 davidbschultz 847587744830427137 53739928 A_Childers_ 2017-03-30 23:52:23+00:00 847597404719267841 1638925448
2 AriPeskoe 847575250598494209 499013898 A_Childers_ 2017-03-30 23:37:48+00:00 847593734896324608 1638925448
3 Pat_Ambrosio 847533984833777664 2497185313 A_Childers_ 2017-03-30 19:41:27+00:00 847534254355599364 1638925448
4 ellisromance 847190236174176256 533335518 A_Childers_ 2017-03-29 20:57:37+00:00 847191036527067136 1638925448

Remove duplicates

In [58]:
# drop_duplicates() with no subset only removes rows that are identical in
# every column.
# NOTE(review): presumably duplicates arise because the same tweet appears in
# more than one input file; if a tweet can be captured with differing user
# fields, de-duplicating on tweet_id alone may be what is wanted — confirm.
dedupe_reply_df = reply_df.drop_duplicates()
dedupe_reply_df.count()
Out[58]:
reply_to_screen_name    396296
reply_to_tweet_id       396296
reply_to_user_id        396296
screen_name             396296
tweet_created_at        396296
tweet_id                396296
user_id                 396296
dtype: int64
In [59]:
# From the replies, extract map of user ids to screen names.
# For each replied-to user id, take the row of the most recent reply
# (idxmax on tweet_created_at) so the lookup carries the screen name seen on
# that latest reply. Rows and columns are selected in a single .loc call; the
# previously used .ix indexer has been removed from pandas.
reply_user_id_lookup_df = (
    dedupe_reply_df
    .loc[
        dedupe_reply_df.groupby('reply_to_user_id')['tweet_created_at'].idxmax(),
        ['reply_to_user_id', 'reply_to_screen_name'],
    ]
    .set_index(['reply_to_user_id'])
)
reply_user_id_lookup_df.count()
Out[59]:
reply_to_screen_name    74638
dtype: int64
In [60]:
# NOTE(review): this displays the *retweet* lookup (defined outside this
# section), although the surrounding cells build the reply lookup;
# reply_user_id_lookup_df.head() may have been intended — confirm.
retweet_user_id_lookup_df.head()
Out[60]:
retweet_screen_name
retweet_user_id
1000010898 RoyScranton
100002112 whyyradiotimes
100003141 NCCDtweets
100005598 hotelkeys
100007369 signixsolutions
In [61]:
# Count replies received per replied-to user id.
# The number of rows should equal the row count of the reply user id lookup,
# since both are keyed on the distinct reply_to_user_id values.
reply_summary_user_id_df = pd.DataFrame(
    {'reply_count': dedupe_reply_df.groupby('reply_to_user_id').size()}
)
reply_summary_user_id_df.count()
Out[61]:
reply_count    74638
dtype: int64
In [62]:
# Preview the per-user reply counts (indexed by reply_to_user_id).
reply_summary_user_id_df.head()
Out[62]:
reply_count
reply_to_user_id
1000010898 5
1000030188 8
100003141 2
100005598 18
1000228238 3
In [63]:
# Attach a screen name to each per-user reply count. Both frames are indexed
# by reply_to_user_id, so the join aligns on the index.
reply_summary_df = reply_summary_user_id_df.join(
    reply_user_id_lookup_df,
    how='left',
)
reply_summary_df.count()
Out[63]:
reply_count             74638
reply_to_screen_name    74638
dtype: int64
In [64]:
# Preview the reply counts together with the joined screen names.
reply_summary_df.head()
Out[64]:
reply_count reply_to_screen_name
reply_to_user_id
1000010898 5 RoyScranton
1000030188 8 jessieb747
100003141 2 NCCDtweets
100005598 18 hotelkeys
1000228238 3 adwooldridge

Join the replies and known Twitter accounts

In [65]:
# Left-join the per-account reply summary with the known-account lookup so
# every replied-to account is kept, whether or not it matched a known account.
reply_join_df = reply_summary_df.join(screen_name_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment and is not guaranteed to
# update reply_join_df itself.
reply_join_df['type'] = reply_join_df['type'].fillna('unknown')
reply_join_df.index.name = 'user_id'
reply_join_df.head()
Out[65]:
reply_count reply_to_screen_name screen_name screen_name_lower type
user_id
1000010898 5 RoyScranton NaN NaN unknown
1000030188 8 jessieb747 NaN NaN unknown
100003141 2 NCCDtweets NaN NaN unknown
100005598 18 hotelkeys NaN NaN unknown
1000228238 3 adwooldridge NaN NaN unknown

Top (by reply count) accounts that are matched against known Twitter accounts <----------

In [66]:
# Keep only rows where the join populated screen_name (i.e. the account matched
# a known account), ordered from most to least replied to.
top_known_reply_df = (
    reply_join_df
    .loc[pd.notnull(reply_join_df['screen_name'])]
    .sort_values(by='reply_count', ascending=False)
)
top_known_reply_df.loc[:, ['reply_to_screen_name', 'reply_count', 'type']].head(20)
Out[66]:
reply_to_screen_name reply_count type
user_id
3817401 ericgeller 1760 reporters
22891564 chrisgeidner 1652 reporters
398088661 MEPFuller 1533 reporters
906734342 KimberlyRobinsn 1525 reporters
118130765 dylanlscott 1501 reporters
103016675 AaronMehta 1278 reporters
46557945 StevenTDennis 1209 reporters
14597239 TonyRomm 1180 reporters
47758416 marissaaevans 1167 reporters
123738314 greggiroux 1167 reporters
19847765 sahilkapur 1035 reporters
16125224 ByronTau 1014 reporters
11771512 OKnox 900 reporters
46555511 Alex_Panetta 899 reporters
275207082 AlexParkerDC 838 reporters
26559241 fordm 823 reporters
29771100 lawrencehurley 803 reporters
225265639 ddale8 760 reporters
23332846 mattzap 738 reporters
16285830 philewing 733 reporters

Number of matched accounts <----------

The reply_to_screen_name count is the number of unique replied-to accounts. The screen_name count is the number of those accounts that matched a known Twitter account.

In [67]:
# Non-null count per column: screen_name / screen_name_lower are only populated
# for rows that the left join matched, so their counts give the number of matches.
reply_join_df.count()
Out[67]:
reply_count             74638
reply_to_screen_name    74638
screen_name              1763
screen_name_lower        1763
type                    74638
dtype: int64

Top accounts by replies <----------

A type of "unknown" indicates that the account is not matched with a known Twitter account.

In [68]:
# Rank every replied-to account (matched or not) by the number of replies received.
top_replies_df = reply_join_df.sort_values(
    by='reply_count',
    ascending=False,
)
top_replies_df.loc[:, ['reply_to_screen_name', 'reply_count', 'type']].head(50)
Out[68]:
reply_to_screen_name reply_count type
user_id
3817401 ericgeller 1760 reporters
22891564 chrisgeidner 1652 reporters
398088661 MEPFuller 1533 reporters
906734342 KimberlyRobinsn 1525 reporters
118130765 dylanlscott 1501 reporters
103016675 AaronMehta 1278 reporters
46557945 StevenTDennis 1209 reporters
14597239 TonyRomm 1180 reporters
123738314 greggiroux 1167 reporters
47758416 marissaaevans 1167 reporters
19847765 sahilkapur 1035 reporters
16125224 ByronTau 1014 reporters
11771512 OKnox 900 reporters
46555511 Alex_Panetta 899 reporters
275207082 AlexParkerDC 838 reporters
26559241 fordm 823 reporters
29771100 lawrencehurley 803 reporters
225265639 ddale8 760 reporters
23332846 mattzap 738 reporters
16285830 philewing 733 reporters
154562655 KateMereand 732 unknown
493756786 amir_anasr 729 reporters
227790723 RichardRubinDC 704 reporters
46955476 GrahamDavidA 690 reporters
19186003 seungminkim 682 reporters
16061946 kelmej 680 reporters
52392666 ZoeTillman 621 reporters
14362404 bradheath 615 reporters
269911034 YAppelbaum 610 reporters
437019753 TimothyNoah1 609 reporters
300552750 B_resnick 606 reporters
80111587 JeffYoung 590 reporters
230450027 gdebenedetti 576 reporters
16459325 ryanbeckwith 557 reporters
1638925448 A_Childers_ 551 reporters
8083262 jeremybowers 547 reporters
391560579 hshaban 542 reporters
13524182 daveweigel 536 reporters
15463671 samsteinhp 533 reporters
45399148 jeneps 519 reporters
16434028 gabbilevy 517 reporters
16868756 ddiamond 515 unknown
140286364 nielslesniewski 510 reporters
114670081 rebleber 504 reporters
17907987 timkmak 499 reporters
195840597 JNicholsonInDC 499 reporters
46213956 JamilSmith 496 reporters
11125672 dmccabe 495 reporters
217550862 BresPolitico 481 reporters
7768402 ryanjreilly 474 reporters

Replies by account type <----------

In [69]:
# Total replies per account type. Select the numeric count column explicitly:
# summing the whole frame also tries to aggregate the string screen-name
# columns, which newer pandas versions no longer silently drop.
reply_join_df.groupby('type')[['reply_count']].sum()
Out[69]:
reply_count
type
government 393
media 1557
politicians 882
reporters 113377
unknown 280087

Top (by replies) accounts that are not known. <----------

These are the accounts that we will want to categorize.

In [70]:
# Accounts whose type was filled in as 'unknown' (no match against the known
# accounts), ordered from most to least replied to.
top_not_known_replies_df = (
    reply_join_df
    .loc[reply_join_df['type'] == 'unknown']
    .sort_values(by='reply_count', ascending=False)
)
top_not_known_replies_df.loc[:, ['reply_to_screen_name', 'reply_count']].head(100)
Out[70]:
reply_to_screen_name reply_count
user_id
154562655 KateMereand 732
16868756 ddiamond 515
142721190 elisefoley 468
14412533 CillizzaCNN 461
16244449 jbarro 427
97371315 LoganDobson 421
135575282 morningmoneyben 413
15446531 mattyglesias 406
17243582 blakehounshell 390
51462013 lizzieohreally 383
93069110 maggieNYT 383
218325695 Bencjacobs 359
950531 pbump 352
26133429 Alex_Roarty 338
47455112 lrozen 330
370113964 sangerkatz 314
14096763 TheStalwart 307
105966714 onceuponA 306
75990180 LAASummers 304
48120914 SopanDeb 303
15985111 emptywheel 302
4207961 chrislhayes 302
270457886 Kyle_Feldscher 284
326255267 KFILE 276
79743108 alexburnsNYT 271
15714370 taykuy 270
18164272 NickBaumann 267
190298721 jamespmanley 259
289462808 JoshZumbrun 256
104914594 Phil_Mattingly 256
... ... ...
70424767 SonnyBunch 182
23908154 aedwardslevy 180
86105641 AaronKAlbright 179
21431618 mckaycoppins 173
143991014 brianros1 173
25570250 kristoncapps 169
373522445 BradMossEsq 168
227373977 MarcACaputo 167
12609882 abeaujon 167
2848493051 dave_brown24 166
234879892 Ed_Demaria 165
459872442 RadioFreeTom 164
25911963 charles_gaba 163
60040821 maxjrosenthal 163
18772897 reidepstein 162
888621 wexler 160
14477723 sethdmichaels 159
221563863 Gardner_LM 158
14066024 dceiver 157
15742985 lachlan 156
220911060 LeighGiangreco 156
346082172 LorenAdler 156
2427429092 AlanBalutis 155
21970248 cushbomb 155
452521774 _cingraham 152
14190948 jbouie 151
56480955 NeilShader 151
16405372 PoliticsK12 151
16076032 ggreenwald 150
44688778 swin24 150

100 rows × 2 columns

In [ ]: