import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
# Use the root logger (getLogger() with no name) and lower its threshold to
# DEBUG so that both INFO and DEBUG records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Simplify the tweet on load
def mention_transform(tweet):
    """Flatten one tweet dict into a list of mention records.

    Tweets that carry a ``retweeted_status`` or ``quoted_status`` key yield an
    empty list. For any other tweet, one dict is produced per entry in
    ``tweet['entities']['user_mentions']`` (missing keys are treated as "no
    mentions"). Each record holds the tweet id, the author's id and screen
    name, the mentioned account's id and screen name, and the tweet's
    ``created_at`` value parsed with ``date_parse``.
    """
    # Guard clause: retweets and quote tweets contribute no mentions.
    if 'retweeted_status' in tweet or 'quoted_status' in tweet:
        return []

    mentioned_accounts = tweet.get('entities', {}).get('user_mentions', [])
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': mentioned['id_str'],
            'mention_screen_name': mentioned['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mentioned in mentioned_accounts
    ]
# Build the mention table by handing mention_transform and the expected column
# names to load_tweet_df. The column list matches the keys of the dicts that
# mention_transform returns.
# NOTE(review): presumably load_tweet_df applies the transform to every loaded
# tweet and collects the returned records into a DataFrame — confirm in utils.
mention_df = load_tweet_df(mention_transform, ['tweet_id', 'user_id', 'screen_name', 'mention_user_id',
'mention_screen_name', 'tweet_created_at'])
INFO:root:Loading from tweets/5936930d07c44a1fb22378ef93e07035_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 INFO:root:Loading from tweets/929612cdd2594269a16b590888a17447_001.json.gz DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 DEBUG:root:Loaded 300000 INFO:root:Loading from tweets/adda61cb31f044ffafecacc6331b0e7d_001.json.gz DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000
mention_df[['mention_user_id']].count()
mention_user_id 113355 dtype: int64
Each mention consists of the tweet id, the screen name and user id that is mentioned, and the screen_name and user_id that is mentioning.
mention_df.head()
tweet_id | user_id | screen_name | mention_user_id | mention_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
1 | 874695069584027648 | 15731368 | HowardKurtz | 171154131 | HappeningNow | 2017-06-13 18:28:50+00:00 |
2 | 874652287809064960 | 15731368 | HowardKurtz | 110445334 | megynkelly | 2017-06-13 15:38:50+00:00 |
3 | 874281339150884864 | 15731368 | HowardKurtz | 486907980 | CarleyShimkus | 2017-06-12 15:04:49+00:00 |
6 | 874265555552882688 | 15731368 | HowardKurtz | 228747444 | TheJuanWilliams | 2017-06-12 14:02:06+00:00 |
8 | 873968413160464384 | 15731368 | HowardKurtz | 3056746427 | JessicaTarlov | 2017-06-11 18:21:21+00:00 |
# From the mentions, extract a map of mentioned user ids to screen names.
# For each mention_user_id, idxmax() picks the row label of the mention with
# the latest tweet_created_at, so the screen name comes from the most recent
# mention (presumably so that the newest name wins if an account was renamed).
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0; the
# label-based .loc selection below is the equivalent.
mention_user_id_lookup_df = (
    mention_df
    .loc[mention_df.groupby('mention_user_id')['tweet_created_at'].idxmax(),
         ['mention_user_id', 'mention_screen_name']]
    .set_index(['mention_user_id'])
)
mention_user_id_lookup_df.count()
mention_screen_name 25644 dtype: int64
mention_user_id_lookup_df.head()
mention_screen_name | |
---|---|
mention_user_id | |
100005598 | hotelkeys |
10003742 | hari |
100039156 | briancassella |
1000968684 | bmcushing |
100136358 | Karnythia |
# From the users (not the mentions), extract a map of user ids to screen names.
# For each mentioning user_id, idxmax() picks the row label of that user's
# mention with the latest tweet_created_at, so one row per user is kept.
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0; the
# label-based .loc selection below is the equivalent.
user_id_lookup_df = (
    mention_df
    .loc[mention_df.groupby('user_id')['tweet_created_at'].idxmax(),
         ['user_id', 'screen_name']]
    .set_index(['user_id'])
)
user_id_lookup_df.count()
screen_name 2056 dtype: int64
# Count the mention rows per mentioned account (grouped by mention_user_id).
# There is one row per distinct mention_user_id, so the row count should match
# the mentioned-user-id lookup's row count.
mention_summary_user_id_df = (
    mention_df
    .groupby('mention_user_id')
    .size()
    .to_frame(name='mention_count')
)
mention_summary_user_id_df.count()
mention_count 25644 dtype: int64
mention_summary_user_id_df.head()
mention_count | |
---|---|
mention_user_id | |
100005598 | 2 |
10003742 | 4 |
100039156 | 1 |
1000968684 | 1 |
100136358 | 4 |
# Join with user id map
# Index-on-index join (DataFrame.join defaults to a left join): adds the
# mention_screen_name column to each per-account mention count.
mention_summary_screen_name_df = mention_summary_user_id_df.join(mention_user_id_lookup_df)
mention_summary_screen_name_df.count()
mention_count 25644 mention_screen_name 25644 dtype: int64
mention_summary_screen_name_df.head()
mention_count | mention_screen_name | |
---|---|---|
mention_user_id | ||
100005598 | 2 | hotelkeys |
10003742 | 4 | hari |
100039156 | 1 | briancassella |
1000968684 | 1 | bmcushing |
100136358 | 4 | Karnythia |
# Load lookups of known users
# Only the 'type' column of the loaded lookup is kept.
# NOTE(review): the displayed output suggests the lookup is indexed by
# user_id — confirm in utils.load_user_type_lookup_df.
from utils import load_user_type_lookup_df
user_type_lookup_df = load_user_type_lookup_df()[['type']]
user_type_lookup_df.count()
INFO:root:Loading lookups from lookups/senate_press_lookup.csv INFO:root:Loading lookups from lookups/periodical_press_lookup.csv INFO:root:Loading lookups from lookups/radio_and_television_lookup.csv
type 2493 dtype: int64
user_type_lookup_df.head()
type | |
---|---|
user_id | |
23455653 | beltway_journalists |
33919343 | beltway_journalists |
18580432 | beltway_journalists |
399225358 | beltway_journalists |
18834692 | beltway_journalists |
user_type_lookup_df['type'].value_counts()
beltway_journalists 2493 Name: type, dtype: int64
# Join the mentions and the known users
# Left join on the index: mentioned accounts that are absent from the lookup
# get NaN in 'type', which is then replaced by 'unknown'.
mention_summary_type_df = mention_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# result of a column selection: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves the frame unchanged under
# copy-on-write.
mention_summary_type_df['type'] = mention_summary_type_df['type'].fillna('unknown')
# The index holds the mentioned account's id; rename it to user_id.
mention_summary_type_df.index.name = 'user_id'
mention_summary_type_df.count()
mention_count 25644 mention_screen_name 25644 type 25644 dtype: int64
mention_summary_type_df.head()
mention_count | mention_screen_name | type | |
---|---|---|---|
user_id | |||
100005598 | 2 | hotelkeys | unknown |
10003742 | 4 | hari | unknown |
100039156 | 1 | briancassella | unknown |
1000968684 | 1 | bmcushing | unknown |
100136358 | 4 | Karnythia | unknown |
The number of distinct users mentioning each account, which is different from the number of mentions.
# Collapse repeated mentions: keep one row per (mentioned account, mentioning
# user) pair, so each user counts once per account.
mention_user_id_per_user_df = mention_df[['mention_user_id', 'user_id']].drop_duplicates()
# Number of distinct mentioning users per mentioned account.
mention_user_id_per_user_summary_df = (
    mention_user_id_per_user_df
    .groupby('mention_user_id')
    .size()
    .to_frame(name='users_mentioning_count')
)
# The index holds the mentioned account's id; name it user_id.
mention_user_id_per_user_summary_df.index.name = 'user_id'
mention_user_id_per_user_summary_df.head()
# Add users_mentioning_count to the per-account summary (index-on-index join).
mention_summary_df = mention_summary_type_df.join(mention_user_id_per_user_summary_df)
# Denominator: the number of non-null screen names in user_id_lookup_df.
# Despite the column name, the stored value is a plain ratio (count / count),
# not multiplied by 100.
mentioning_user_total = user_id_lookup_df['screen_name'].count()
mention_summary_df['percent_of_users_mentioning'] = (
    mention_summary_df['users_mentioning_count'] / mentioning_user_total
)
mention_summary_df.head()
mention_count | mention_screen_name | type | users_mentioning_count | percent_of_users_mentioning | |
---|---|---|---|---|---|
user_id | |||||
100005598 | 2 | hotelkeys | unknown | 2 | 0.000973 |
10003742 | 4 | hari | unknown | 3 | 0.001459 |
100039156 | 1 | briancassella | unknown | 1 | 0.000486 |
1000968684 | 1 | bmcushing | unknown | 1 | 0.000486 |
100136358 | 4 | Karnythia | unknown | 1 | 0.000486 |
Mentions per user, for users that made any mentions. It is also possible to figure this out for all users.
mention_df['user_id'].value_counts().describe()
count 2056.000000 mean 55.133755 std 145.858953 min 1.000000 25% 6.000000 50% 18.000000 75% 58.000000 max 4778.000000 Name: user_id, dtype: float64
# Bucket the mentioned accounts by users_mentioning_count. For each bucket,
# 'sum' is the total of mention_count and 'size' is the number of accounts
# (rows) in the bucket.
# The aggregations are given as string names rather than np.sum / np.size:
# passing NumPy callables to agg is deprecated in recent pandas, and the
# strings produce the same ('mention_count', 'sum') and
# ('mention_count', 'size') column labels.
mention_grouped_by_users_mentioning_df = (
    mention_summary_df[['mention_count', 'users_mentioning_count']]
    .groupby(by='users_mentioning_count')
    .agg(['sum', 'size'])
)
# Running total of mentions across buckets, and its share of all mentions.
mention_grouped_by_users_mentioning_df['cumulative_mention_count_sum'] = mention_grouped_by_users_mentioning_df['mention_count', 'sum'].cumsum()
mention_grouped_by_users_mentioning_df['cumulative_mention_count_sum_percentage'] = mention_grouped_by_users_mentioning_df['cumulative_mention_count_sum'] / mention_grouped_by_users_mentioning_df['mention_count', 'sum'].sum()
# Running total of mentioned accounts across buckets, and its share of all
# mentioned accounts.
mention_grouped_by_users_mentioning_df['cumulative_mentioned_users'] = mention_grouped_by_users_mentioning_df['mention_count', 'size'].cumsum()
mention_grouped_by_users_mentioning_df['cumulative_mentioned_users_percentage'] = mention_grouped_by_users_mentioning_df['cumulative_mentioned_users'] / mention_grouped_by_users_mentioning_df['mention_count', 'size'].sum()
mention_grouped_by_users_mentioning_df
mention_count | cumulative_mention_count_sum | cumulative_mention_count_sum_percentage | cumulative_mentioned_users | cumulative_mentioned_users_percentage | ||
---|---|---|---|---|---|---|
sum | size | |||||
users_mentioning_count | ||||||
1 | 31346 | 19101 | 31346 | 0.276529 | 19101 | 0.744853 |
2 | 9905 | 2898 | 41251 | 0.363910 | 21999 | 0.857861 |
3 | 5803 | 1087 | 47054 | 0.415103 | 23086 | 0.900250 |
4 | 4690 | 600 | 51744 | 0.456477 | 23686 | 0.923647 |
5 | 4402 | 429 | 56146 | 0.495311 | 24115 | 0.940376 |
6 | 3169 | 264 | 59315 | 0.523268 | 24379 | 0.950671 |
7 | 2396 | 194 | 61711 | 0.544405 | 24573 | 0.958236 |
8 | 2159 | 147 | 63870 | 0.563451 | 24720 | 0.963968 |
9 | 1782 | 109 | 65652 | 0.579172 | 24829 | 0.968219 |
10 | 2055 | 106 | 67707 | 0.597301 | 24935 | 0.972352 |
11 | 1854 | 92 | 69561 | 0.613656 | 25027 | 0.975940 |
12 | 1349 | 50 | 70910 | 0.625557 | 25077 | 0.977890 |
13 | 1633 | 60 | 72543 | 0.639963 | 25137 | 0.980229 |
14 | 1518 | 52 | 74061 | 0.653355 | 25189 | 0.982257 |
15 | 1359 | 44 | 75420 | 0.665343 | 25233 | 0.983973 |
16 | 1069 | 34 | 76489 | 0.674774 | 25267 | 0.985299 |
17 | 1516 | 40 | 78005 | 0.688148 | 25307 | 0.986859 |
18 | 810 | 27 | 78815 | 0.695294 | 25334 | 0.987911 |
19 | 1246 | 27 | 80061 | 0.706286 | 25361 | 0.988964 |
20 | 421 | 12 | 80482 | 0.710000 | 25373 | 0.989432 |
21 | 1126 | 24 | 81608 | 0.719933 | 25397 | 0.990368 |
22 | 702 | 15 | 82310 | 0.726126 | 25412 | 0.990953 |
23 | 941 | 19 | 83251 | 0.734427 | 25431 | 0.991694 |
24 | 1197 | 17 | 84448 | 0.744987 | 25448 | 0.992357 |
25 | 595 | 10 | 85043 | 0.750236 | 25458 | 0.992747 |
26 | 977 | 15 | 86020 | 0.758855 | 25473 | 0.993332 |
27 | 642 | 11 | 86662 | 0.764519 | 25484 | 0.993761 |
28 | 782 | 15 | 87444 | 0.771417 | 25499 | 0.994346 |
29 | 1082 | 12 | 88526 | 0.780962 | 25511 | 0.994814 |
30 | 536 | 9 | 89062 | 0.785691 | 25520 | 0.995165 |
... | ... | ... | ... | ... | ... | ... |
53 | 120 | 1 | 99921 | 0.881487 | 25605 | 0.998479 |
55 | 204 | 1 | 100125 | 0.883287 | 25606 | 0.998518 |
56 | 396 | 3 | 100521 | 0.886780 | 25609 | 0.998635 |
57 | 210 | 2 | 100731 | 0.888633 | 25611 | 0.998713 |
58 | 450 | 3 | 101181 | 0.892603 | 25614 | 0.998830 |
59 | 273 | 1 | 101454 | 0.895011 | 25615 | 0.998869 |
60 | 101 | 1 | 101555 | 0.895902 | 25616 | 0.998908 |
61 | 98 | 1 | 101653 | 0.896767 | 25617 | 0.998947 |
62 | 281 | 2 | 101934 | 0.899246 | 25619 | 0.999025 |
64 | 102 | 1 | 102036 | 0.900146 | 25620 | 0.999064 |
67 | 324 | 1 | 102360 | 0.903004 | 25621 | 0.999103 |
68 | 153 | 1 | 102513 | 0.904354 | 25622 | 0.999142 |
70 | 527 | 3 | 103040 | 0.909003 | 25625 | 0.999259 |
72 | 399 | 1 | 103439 | 0.912523 | 25626 | 0.999298 |
74 | 232 | 1 | 103671 | 0.914569 | 25627 | 0.999337 |
79 | 319 | 1 | 103990 | 0.917383 | 25628 | 0.999376 |
82 | 394 | 2 | 104384 | 0.920859 | 25630 | 0.999454 |
83 | 956 | 2 | 105340 | 0.929293 | 25632 | 0.999532 |
86 | 211 | 1 | 105551 | 0.931154 | 25633 | 0.999571 |
105 | 629 | 1 | 106180 | 0.936703 | 25634 | 0.999610 |
113 | 1113 | 1 | 107293 | 0.946522 | 25635 | 0.999649 |
121 | 350 | 1 | 107643 | 0.949610 | 25636 | 0.999688 |
124 | 392 | 1 | 108035 | 0.953068 | 25637 | 0.999727 |
128 | 426 | 1 | 108461 | 0.956826 | 25638 | 0.999766 |
134 | 351 | 1 | 108812 | 0.959922 | 25639 | 0.999805 |
148 | 471 | 1 | 109283 | 0.964077 | 25640 | 0.999844 |
151 | 420 | 1 | 109703 | 0.967783 | 25641 | 0.999883 |
209 | 515 | 1 | 110218 | 0.972326 | 25642 | 0.999922 |
245 | 1407 | 1 | 111625 | 0.984738 | 25643 | 0.999961 |
361 | 1730 | 1 | 113355 | 1.000000 | 25644 | 1.000000 |
80 rows × 6 columns
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline
# Plot the two cumulative-share columns against the frame's index
# (users_mentioning_count).
mention_grouped_by_users_mentioning_df[['cumulative_mention_count_sum_percentage', 'cumulative_mentioned_users_percentage']].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10c9e42e8>
Removes users that were only mentioned by 1 user.
# Cut off the tail: drop, in place, every account whose
# users_mentioning_count is exactly 1.
mentioned_by_single_user = mention_summary_df['users_mentioning_count'] == 1
mention_summary_df.drop(mention_summary_df.index[mentioned_by_single_user.values], inplace=True)
# Number of accounts that remain after the cut.
mention_summary_df['mention_screen_name'].count()
6543
Unknown for type indicates that it is not matched with a known Twitter account.
mention_summary_df.sort_values('mention_count', ascending=False).head(50)
mention_count | mention_screen_name | type | users_mentioning_count | percent_of_users_mentioning | |
---|---|---|---|---|---|
user_id | |||||
25073877 | 1730 | realDonaldTrump | unknown | 361 | 0.175584 |
822215679726100480 | 1407 | POTUS | unknown | 245 | 0.119163 |
51241574 | 1113 | AP | unknown | 113 | 0.054961 |
15391102 | 1009 | wusa9 | unknown | 44 | 0.021401 |
14980820 | 845 | nbcwashington | unknown | 43 | 0.020914 |
15754281 | 775 | USATODAY | unknown | 83 | 0.040370 |
18956073 | 661 | dcexaminer | unknown | 50 | 0.024319 |
3108351 | 629 | WSJ | unknown | 105 | 0.051070 |
807095 | 515 | nytimes | unknown | 209 | 0.101654 |
9300262 | 471 | politico | unknown | 148 | 0.071984 |
459277523 | 437 | BloombergBNA | unknown | 35 | 0.017023 |
818927131883356161 | 426 | PressSec | unknown | 128 | 0.062257 |
759251 | 420 | CNN | unknown | 151 | 0.073444 |
1652541 | 399 | Reuters | unknown | 72 | 0.035019 |
1337271 | 394 | darth | unknown | 31 | 0.015078 |
1209417007 | 392 | SteveScalise | unknown | 124 | 0.060311 |
17906632 | 377 | WTOP | unknown | 34 | 0.016537 |
14897840 | 375 | ABC7News | unknown | 24 | 0.011673 |
2467791 | 351 | washingtonpost | unknown | 134 | 0.065175 |
18916432 | 350 | SpeakerRyan | unknown | 121 | 0.058852 |
2836421 | 324 | MSNBC | unknown | 67 | 0.032588 |
823693635564802048 | 319 | SHSanders45 | unknown | 79 | 0.038424 |
13918492 | 288 | fox5dc | unknown | 26 | 0.012646 |
3817401 | 285 | ericgeller | beltway_journalists | 47 | 0.022860 |
15675138 | 273 | cspan | unknown | 59 | 0.028696 |
19576571 | 250 | JaredRizzi | beltway_journalists | 29 | 0.014105 |
564111558 | 248 | bpolitics | unknown | 52 | 0.025292 |
950531 | 237 | pbump | unknown | 37 | 0.017996 |
2312829909 | 234 | CQnow | unknown | 29 | 0.014105 |
15922214 | 232 | rollcall | unknown | 44 | 0.021401 |
1367531 | 232 | FoxNews | unknown | 74 | 0.035992 |
13850422 | 230 | CNNPolitics | unknown | 82 | 0.039883 |
28785486 | 222 | ABC | unknown | 58 | 0.028210 |
398088661 | 218 | MEPFuller | beltway_journalists | 70 | 0.034047 |
90614279 | 216 | EENewsUpdates | unknown | 21 | 0.010214 |
818910970567344128 | 211 | VP | unknown | 86 | 0.041829 |
14173315 | 204 | NBCNews | unknown | 55 | 0.026751 |
800707492346925056 | 190 | axios | unknown | 48 | 0.023346 |
13524182 | 181 | daveweigel | beltway_journalists | 83 | 0.040370 |
22429979 | 177 | nycsouthpaw | unknown | 22 | 0.010700 |
18949452 | 177 | FT | unknown | 17 | 0.008268 |
15012486 | 174 | CBSNews | unknown | 36 | 0.017510 |
2167286587 | 172 | byrdinator | unknown | 40 | 0.019455 |
9624742 | 167 | StateDept | unknown | 39 | 0.018969 |
93069110 | 164 | maggieNYT | unknown | 82 | 0.039883 |
14515799 | 160 | brianstelter | unknown | 56 | 0.027237 |
48120914 | 155 | SopanDeb | unknown | 70 | 0.034047 |
7429102 | 154 | MarkWarner | unknown | 70 | 0.034047 |
15764644 | 153 | NancyPelosi | unknown | 68 | 0.033074 |
14662354 | 152 | WashTimes | unknown | 19 | 0.009241 |
# Total mentions received per account type, and each type's share of all
# mentions (a ratio, not multiplied by 100).
types_by_mention_count_df = mention_summary_df.groupby('type')[['mention_count']].sum()
types_by_mention_count_df['type_percentage'] = (
    types_by_mention_count_df['mention_count']
    / types_by_mention_count_df['mention_count'].sum()
)
types_by_mention_count_df.sort_values(by='mention_count', ascending=False)
mention_count | type_percentage | |
---|---|---|
type | ||
unknown | 67652 | 0.824934 |
beltway_journalists | 14357 | 0.175066 |
Mentions by type per user.
# Attach the account type of the *mentioned* user to every mention row
# (left merge of mention_user_id against the lookup's index); mentions of
# accounts missing from the lookup get NaN, replaced below by 'unknown'.
mention_all_join_df = pd.merge(mention_df, user_type_lookup_df[['type']], how='left', left_on='mention_user_id', right_index=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# result of a column selection: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves the frame unchanged under
# copy-on-write.
mention_all_join_df['type'] = mention_all_join_df['type'].fillna('unknown')
# Drop tail
# Keep only mentions whose mentioned account is still in mention_summary_df.
mention_all_join_limited_df = mention_all_join_df[mention_all_join_df.mention_user_id.isin(mention_summary_df.index)]
mention_all_join_limited_df.head()
tweet_id | user_id | screen_name | mention_user_id | mention_screen_name | tweet_created_at | type | |
---|---|---|---|---|---|---|---|
1 | 874695069584027648 | 15731368 | HowardKurtz | 171154131 | HappeningNow | 2017-06-13 18:28:50+00:00 | unknown |
2 | 874652287809064960 | 15731368 | HowardKurtz | 110445334 | megynkelly | 2017-06-13 15:38:50+00:00 | unknown |
6 | 874265555552882688 | 15731368 | HowardKurtz | 228747444 | TheJuanWilliams | 2017-06-12 14:02:06+00:00 | unknown |
8 | 873968413160464384 | 15731368 | HowardKurtz | 3056746427 | JessicaTarlov | 2017-06-11 18:21:21+00:00 | unknown |
9 | 873955901484486656 | 15731368 | HowardKurtz | 16157855 | edhenry | 2017-06-11 17:31:38+00:00 | beltway_journalists |
# Count each mentioning user's mentions per mentioned-account type: one row
# per user_id, one column per type. Combinations that never occur come out of
# unstack() as NaN and are set to 0.
mention_summary_by_user_df = (
    mention_all_join_limited_df
    .groupby(['user_id', 'type'])
    .size()
    .unstack()
    .fillna(0)
)
# Remember the per-type columns before any derived column is added.
type_columns = list(mention_summary_by_user_df.columns)
# Add a total column
mention_summary_by_user_df['total'] = mention_summary_by_user_df.sum(axis=1)
# For every type, the share of the user's mentions that went to that type.
for type_name in type_columns:
    mention_summary_by_user_df['{}_percent'.format(type_name)] = (
        mention_summary_by_user_df[type_name] / mention_summary_by_user_df['total']
    )
mention_summary_by_user_df.head(10)
type | beltway_journalists | unknown | total | beltway_journalists_percent | unknown_percent |
---|---|---|---|---|---|
user_id | |||||
1001991865 | 0.0 | 1.0 | 1.0 | 0.000000 | 1.000000 |
1002229862 | 2.0 | 12.0 | 14.0 | 0.142857 | 0.857143 |
100802089 | 0.0 | 3.0 | 3.0 | 0.000000 | 1.000000 |
100860790 | 7.0 | 22.0 | 29.0 | 0.241379 | 0.758621 |
1009749229 | 5.0 | 21.0 | 26.0 | 0.192308 | 0.807692 |
1013785220 | 2.0 | 11.0 | 13.0 | 0.153846 | 0.846154 |
10162772 | 2.0 | 48.0 | 50.0 | 0.040000 | 0.960000 |
102171691 | 29.0 | 266.0 | 295.0 | 0.098305 | 0.901695 |
1025521 | 13.0 | 43.0 | 56.0 | 0.232143 | 0.767857 |
102718971 | 0.0 | 8.0 | 8.0 | 0.000000 | 1.000000 |
That is, for each user determine the percent of mentions by type. Then take the average of each type.
Thus, this mention analysis is on a per-user basis, normalizing for how prolific a tweeter each user is. (That is, users who tweet more aren't weighted more heavily.)
mention_summary_by_user_df.filter(axis=1, regex="_percent$").mean()
type beltway_journalists_percent 0.170117 unknown_percent 0.829883 dtype: float64
The number of users that mentioned an account. Thus, each user counts as 1, even if that user made multiple mentions of the account.
This weights an account that is mentioned by 100 users more heavily than an account that is mentioned 100 times by a single user.
mention_summary_df.sort_values('users_mentioning_count', ascending=False).head(20)
mention_count | mention_screen_name | type | users_mentioning_count | percent_of_users_mentioning | |
---|---|---|---|---|---|
user_id | |||||
25073877 | 1730 | realDonaldTrump | unknown | 361 | 0.175584 |
822215679726100480 | 1407 | POTUS | unknown | 245 | 0.119163 |
807095 | 515 | nytimes | unknown | 209 | 0.101654 |
759251 | 420 | CNN | unknown | 151 | 0.073444 |
9300262 | 471 | politico | unknown | 148 | 0.071984 |
2467791 | 351 | washingtonpost | unknown | 134 | 0.065175 |
818927131883356161 | 426 | PressSec | unknown | 128 | 0.062257 |
1209417007 | 392 | SteveScalise | unknown | 124 | 0.060311 |
18916432 | 350 | SpeakerRyan | unknown | 121 | 0.058852 |
51241574 | 1113 | AP | unknown | 113 | 0.054961 |
3108351 | 629 | WSJ | unknown | 105 | 0.051070 |
818910970567344128 | 211 | VP | unknown | 86 | 0.041829 |
13524182 | 181 | daveweigel | beltway_journalists | 83 | 0.040370 |
15754281 | 775 | USATODAY | unknown | 83 | 0.040370 |
13850422 | 230 | CNNPolitics | unknown | 82 | 0.039883 |
93069110 | 164 | maggieNYT | unknown | 82 | 0.039883 |
823693635564802048 | 319 | SHSanders45 | unknown | 79 | 0.038424 |
1367531 | 232 | FoxNews | unknown | 74 | 0.035992 |
1652541 | 399 | Reuters | unknown | 72 | 0.035019 |
398088661 | 218 | MEPFuller | beltway_journalists | 70 | 0.034047 |
# Sum of users_mentioning_count per account type, and each type's share of
# that sum (a ratio, not multiplied by 100).
types_by_users_mentioning_df = mention_summary_df.groupby('type')[['users_mentioning_count']].sum()
types_by_users_mentioning_df['type_percentage'] = (
    types_by_users_mentioning_df['users_mentioning_count']
    / types_by_users_mentioning_df['users_mentioning_count'].sum()
)
types_by_users_mentioning_df.sort_values(by='users_mentioning_count', ascending=False)
users_mentioning_count | type_percentage | |
---|---|---|
type | ||
unknown | 29124 | 0.793699 |
beltway_journalists | 7570 | 0.206301 |
Remember, the tail has been cut off
mention_summary_df[mention_summary_df.type != 'unknown'].count()
mention_count 959 mention_screen_name 959 type 959 users_mentioning_count 959 percent_of_users_mentioning 959 dtype: int64
# Keep only the mentioned accounts whose type is 'unknown'.
is_unknown_type = mention_summary_df['type'] == 'unknown'
not_known_mention_df = mention_summary_df[is_unknown_type]
not_known_mention_df.count()
mention_count 5584 mention_screen_name 5584 type 5584 users_mentioning_count 5584 percent_of_users_mentioning 5584 dtype: int64
# Rank the unknown accounts by mentions received (highest first) and keep
# only the three reporting columns.
not_known_mention_sorted_df = (
    not_known_mention_df
    .sort_values(by='mention_count', ascending=False)
    .loc[:, ['mention_screen_name', 'mention_count', 'users_mentioning_count']]
)
not_known_mention_sorted_df.head(20)
mention_screen_name | mention_count | users_mentioning_count | |
---|---|---|---|
user_id | |||
25073877 | realDonaldTrump | 1730 | 361 |
822215679726100480 | POTUS | 1407 | 245 |
51241574 | AP | 1113 | 113 |
15391102 | wusa9 | 1009 | 44 |
14980820 | nbcwashington | 845 | 43 |
15754281 | USATODAY | 775 | 83 |
18956073 | dcexaminer | 661 | 50 |
3108351 | WSJ | 629 | 105 |
807095 | nytimes | 515 | 209 |
9300262 | politico | 471 | 148 |
459277523 | BloombergBNA | 437 | 35 |
818927131883356161 | PressSec | 426 | 128 |
759251 | CNN | 420 | 151 |
1652541 | Reuters | 399 | 72 |
1337271 | darth | 394 | 31 |
1209417007 | SteveScalise | 392 | 124 |
17906632 | WTOP | 377 | 34 |
14897840 | ABC7News | 375 | 24 |
2467791 | washingtonpost | 351 | 134 |
18916432 | SpeakerRyan | 350 | 121 |
# Write the ranked unknown accounts to CSV; the index is written as the first
# column by default.
# NOTE(review): to_csv does not create the output/ directory, so it must
# already exist.
not_known_mention_sorted_df.to_csv('output/unknown_mentions.csv')