Hereafter, retweets and quotes are referred to collectively as retweets.
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
# Use the root logger at DEBUG level so that progress messages emitted while
# loading tweets (e.g. "DEBUG:root:Loaded 50000") are shown.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Simplify the tweet on load
def retweet_transform(tweet):
    """Reduce a raw tweet to the fields needed for retweet analysis.

    Retweets and quote tweets are both treated as retweets. When a tweet
    carries both a ``retweeted_status`` and a ``quoted_status``, the
    retweeted status is used.

    Args:
        tweet: Tweet as a dict. Reads ``id_str``, ``created_at``,
            ``user.id_str`` and ``user.screen_name``, plus the same user
            fields of the embedded retweeted/quoted status.

    Returns:
        A dict with keys ``tweet_id``, ``user_id``, ``screen_name``,
        ``retweet_user_id``, ``retweet_screen_name`` and
        ``tweet_created_at`` (parsed with ``date_parse``), or ``None`` when
        the tweet is neither a retweet nor a quote.
    """
    retweet = tweet.get('retweeted_status') or tweet.get('quoted_status')
    if not retweet:
        # Original tweets and replies are not part of this analysis.
        return None

    retweeting_user = tweet['user']
    retweeted_user = retweet['user']
    return {
        'tweet_id': tweet['id_str'],
        'user_id': retweeting_user['id_str'],
        'screen_name': retweeting_user['screen_name'],
        'retweet_user_id': retweeted_user['id_str'],
        'retweet_screen_name': retweeted_user['screen_name'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }
# Load the tweets through retweet_transform into a frame with these columns.
retweet_df = load_tweet_df(
    retweet_transform,
    [
        'tweet_id',
        'user_id',
        'screen_name',
        'retweet_user_id',
        'retweet_screen_name',
        'tweet_created_at',
    ],
)
INFO:root:Loading from tweets/6eea2088e010437da4b6031c2abffdc9_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 DEBUG:root:Loaded 300000 INFO:root:Loading from tweets/a7bcdbde7a104285b92fe26e286f2543_001.json.gz DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 INFO:root:Loading from tweets/e1c824ff2b3c4c5a9a93a16e5036d09a_001.json.gz DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000
retweet_df[['retweet_user_id']].count()
retweet_user_id 398988 dtype: int64
Each retweet record consists of the tweet id, the user id and screen name of the retweeting user, the user id and screen name of the retweeted user, and the time the tweet was created.
retweet_df.head()
tweet_id | user_id | screen_name | retweet_user_id | retweet_screen_name | tweet_created_at | |
---|---|---|---|---|---|---|
0 | 847787664963239936 | 285772181 | akesslerdc | 85131054 | jeffzeleny | 2017-03-31 12:28:25+00:00 |
1 | 847634105118318594 | 285772181 | akesslerdc | 128558424 | erin_pelton | 2017-03-31 02:18:13+00:00 |
2 | 847617579627630592 | 285772181 | akesslerdc | 318502583 | ksacknyt | 2017-03-31 01:12:33+00:00 |
3 | 847601029654880258 | 285772181 | akesslerdc | 58504135 | shaneharris | 2017-03-31 00:06:47+00:00 |
4 | 847388672785694720 | 285772181 | akesslerdc | 22772264 | carolelee | 2017-03-30 10:02:57+00:00 |
# From the retweets, extract a map of retweeted user ids to screen names.
# For each retweeted user, keep the screen name seen on the most recent
# retweet (the row with the latest tweet_created_at).
# `.loc` replaces the deprecated `.ix` indexer, which was removed in pandas 1.0.
retweet_user_id_lookup_df = retweet_df.loc[
    retweet_df.groupby('retweet_user_id')['tweet_created_at'].idxmax(),
    ['retweet_user_id', 'retweet_screen_name'],
].set_index(['retweet_user_id'])
retweet_user_id_lookup_df.count()
retweet_screen_name 45104 dtype: int64
retweet_user_id_lookup_df.head()
retweet_screen_name | |
---|---|
retweet_user_id | |
100002112 | whyyradiotimes |
100005598 | hotelkeys |
1000228238 | adwooldridge |
100026898 | tvnewzted |
1000318321 | AJGuglielmi |
# From the retweeting users (not the retweeted accounts), extract a map of
# user ids to screen names, keeping the screen name seen on each user's most
# recent retweet.
# `.loc` replaces the deprecated `.ix` indexer, which was removed in pandas 1.0.
user_id_lookup_df = retweet_df.loc[
    retweet_df.groupby('user_id')['tweet_created_at'].idxmax(),
    ['user_id', 'screen_name'],
].set_index(['user_id'])
user_id_lookup_df.count()
screen_name 1836 dtype: int64
# Count retweets per retweeted account (grouping on retweet_user_id).
# The number of rows should match the size of the retweeted-user lookup.
retweet_summary_user_id_df = (
    retweet_df
    .groupby('retweet_user_id')
    .size()
    .to_frame('retweet_count')
)
retweet_summary_user_id_df.count()
retweet_count 45104 dtype: int64
retweet_summary_user_id_df.head()
retweet_count | |
---|---|
retweet_user_id | |
100002112 | 1 |
100005598 | 1 |
1000228238 | 2 |
100026898 | 1 |
1000318321 | 2 |
# Attach the screen name of each retweeted account; both frames are indexed
# by retweet_user_id, so this is an index-on-index left join.
retweet_summary_screen_name_df = retweet_summary_user_id_df.join(
    retweet_user_id_lookup_df,
    how='left',
)
retweet_summary_screen_name_df.count()
retweet_count 45104 retweet_screen_name 45104 dtype: int64
retweet_summary_screen_name_df.head()
retweet_count | retweet_screen_name | |
---|---|---|
retweet_user_id | ||
100002112 | 1 | whyyradiotimes |
100005598 | 1 | hotelkeys |
1000228238 | 2 | adwooldridge |
100026898 | 1 | tvnewzted |
1000318321 | 2 | AJGuglielmi |
# Load lookups of known users
# Only the 'type' column is kept; it is later joined onto the retweet
# summaries by index (presumably user_id -- confirm in utils).
from utils import load_user_type_lookup_df
user_type_lookup_df = load_user_type_lookup_df()[['type']]
user_type_lookup_df.count()
type 13160 dtype: int64
user_type_lookup_df.head()
type | |
---|---|
user_id | |
2345626885 | journalists |
780221130 | journalists |
285772181 | journalists |
29607664 | journalists |
9484732 | journalists |
user_type_lookup_df['type'].value_counts()
media 4538 journalists 3576 government 3055 politicians 817 ngo 250 pundit 195 other 160 other_political 156 cultural 131 academic 129 business 125 foreign_political 28 Name: type, dtype: int64
# Join the retweet summary with the known users; accounts with no match in
# the lookup get the type 'unknown'.
retweet_summary_type_df = retweet_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form operates on an intermediate
# object and is not guaranteed to update the frame under copy-on-write.
retweet_summary_type_df['type'] = retweet_summary_type_df['type'].fillna('unknown')
retweet_summary_type_df.index.name = 'user_id'
retweet_summary_type_df.count()
retweet_count 45104 retweet_screen_name 45104 type 45104 dtype: int64
retweet_summary_type_df.head()
retweet_count | retweet_screen_name | type | |
---|---|---|---|
user_id | |||
100002112 | 1 | whyyradiotimes | unknown |
100005598 | 1 | hotelkeys | unknown |
1000228238 | 2 | adwooldridge | unknown |
100026898 | 1 | tvnewzted | unknown |
1000318321 | 2 | AJGuglielmi | unknown |
Next, count the number of distinct users retweeting each account, which is different from the number of retweets.
# One row per (retweeted account, retweeting user) pair, so that a user who
# retweets the same account many times is counted only once.
retweet_user_id_per_user_df = retweet_df[['retweet_user_id', 'user_id']].drop_duplicates()
# Number of distinct retweeting users per retweeted account.
retweet_user_id_per_user_summary_df = (
    retweet_user_id_per_user_df
    .groupby('retweet_user_id')
    .size()
    .to_frame('users_retweeting_count')
)
# Rename the index so it lines up by name with retweet_summary_type_df.
retweet_user_id_per_user_summary_df.index.name = 'user_id'
retweet_user_id_per_user_summary_df.head()
# Add the distinct-retweeter counts to the per-account summary (index join).
retweet_summary_df = retweet_summary_type_df.join(
    retweet_user_id_per_user_summary_df
)
# Share of all retweeting users that retweeted each account. Despite the
# column name this is a fraction in [0, 1], not a percentage.
retweet_summary_df['percent_of_users_retweeting'] = (
    retweet_summary_df['users_retweeting_count']
    / user_id_lookup_df['screen_name'].count()
)
retweet_summary_df.head()
retweet_count | retweet_screen_name | type | users_retweeting_count | percent_of_users_retweeting | |
---|---|---|---|---|---|
user_id | |||||
100002112 | 1 | whyyradiotimes | unknown | 1 | 0.000545 |
100005598 | 1 | hotelkeys | unknown | 1 | 0.000545 |
1000228238 | 2 | adwooldridge | unknown | 2 | 0.001089 |
100026898 | 1 | tvnewzted | unknown | 1 | 0.000545 |
1000318321 | 2 | AJGuglielmi | unknown | 2 | 0.001089 |
For users that made any retweets. It is also possible to figure this out for all users.
retweet_df['user_id'].value_counts().describe()
count 1836.000000 mean 217.313725 std 456.459939 min 1.000000 25% 17.000000 50% 70.000000 75% 219.000000 max 6483.000000 Name: user_id, dtype: float64
# Bucket retweeted accounts by the number of distinct users retweeting them.
# For each bucket compute the total retweets ('sum') and the number of
# accounts ('size'). String aggregation names are used instead of
# np.sum/np.size: passing NumPy callables to agg() is deprecated in recent
# pandas, and the resulting column labels ('sum', 'size') are unchanged.
retweet_grouped_by_users_retweeting_df = (
    retweet_summary_df[['retweet_count', 'users_retweeting_count']]
    .groupby(by='users_retweeting_count')
    .agg(['sum', 'size'])
)
# Running totals (and their share of the grand total) of retweets ...
retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum'] = retweet_grouped_by_users_retweeting_df['retweet_count', 'sum'].cumsum()
retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum_percentage'] = retweet_grouped_by_users_retweeting_df['cumulative_retweet_count_sum'] / retweet_grouped_by_users_retweeting_df['retweet_count', 'sum'].sum()
# ... and of retweeted accounts.
retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users'] = retweet_grouped_by_users_retweeting_df['retweet_count', 'size'].cumsum()
retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users_percentage'] = retweet_grouped_by_users_retweeting_df['cumulative_retweeted_users'] / retweet_grouped_by_users_retweeting_df['retweet_count', 'size'].sum()
retweet_grouped_by_users_retweeting_df
retweet_count | cumulative_retweet_count_sum | cumulative_retweet_count_sum_percentage | cumulative_retweeted_users | cumulative_retweeted_users_percentage | ||
---|---|---|---|---|---|---|
sum | size | |||||
users_retweeting_count | ||||||
1 | 38749 | 28998 | 38749 | 0.097118 | 28998 | 0.642914 |
2 | 15860 | 5534 | 54609 | 0.136869 | 34532 | 0.765608 |
3 | 11716 | 2592 | 66325 | 0.166233 | 37124 | 0.823076 |
4 | 9112 | 1536 | 75437 | 0.189071 | 38660 | 0.857130 |
5 | 8530 | 1070 | 83967 | 0.210450 | 39730 | 0.880853 |
6 | 6824 | 703 | 90791 | 0.227553 | 40433 | 0.896439 |
7 | 6330 | 541 | 97121 | 0.243418 | 40974 | 0.908434 |
8 | 5962 | 436 | 103083 | 0.258361 | 41410 | 0.918100 |
9 | 4608 | 316 | 107691 | 0.269910 | 41726 | 0.925106 |
10 | 4535 | 267 | 112226 | 0.281277 | 41993 | 0.931026 |
11 | 4341 | 249 | 116567 | 0.292157 | 42242 | 0.936547 |
12 | 4249 | 208 | 120816 | 0.302806 | 42450 | 0.941158 |
13 | 4568 | 194 | 125384 | 0.314255 | 42644 | 0.945459 |
14 | 4154 | 173 | 129538 | 0.324666 | 42817 | 0.949295 |
15 | 3481 | 135 | 133019 | 0.333391 | 42952 | 0.952288 |
16 | 2806 | 104 | 135825 | 0.340424 | 43056 | 0.954594 |
17 | 3368 | 97 | 139193 | 0.348865 | 43153 | 0.956744 |
18 | 3010 | 92 | 142203 | 0.356409 | 43245 | 0.958784 |
19 | 2537 | 76 | 144740 | 0.362768 | 43321 | 0.960469 |
20 | 3361 | 84 | 148101 | 0.371192 | 43405 | 0.962332 |
21 | 2955 | 85 | 151056 | 0.378598 | 43490 | 0.964216 |
22 | 2963 | 79 | 154019 | 0.386024 | 43569 | 0.965968 |
23 | 2508 | 58 | 156527 | 0.392310 | 43627 | 0.967253 |
24 | 2467 | 56 | 158994 | 0.398493 | 43683 | 0.968495 |
25 | 2514 | 54 | 161508 | 0.404794 | 43737 | 0.969692 |
26 | 2566 | 55 | 164074 | 0.411225 | 43792 | 0.970912 |
27 | 2349 | 41 | 166423 | 0.417113 | 43833 | 0.971821 |
28 | 2665 | 47 | 169088 | 0.423792 | 43880 | 0.972863 |
29 | 2575 | 43 | 171663 | 0.430246 | 43923 | 0.973816 |
30 | 2473 | 44 | 174136 | 0.436444 | 43967 | 0.974792 |
... | ... | ... | ... | ... | ... | ... |
255 | 582 | 1 | 349924 | 0.877029 | 45068 | 0.999202 |
257 | 856 | 1 | 350780 | 0.879174 | 45069 | 0.999224 |
259 | 1237 | 1 | 352017 | 0.882275 | 45070 | 0.999246 |
261 | 885 | 1 | 352902 | 0.884493 | 45071 | 0.999268 |
263 | 1045 | 1 | 353947 | 0.887112 | 45072 | 0.999291 |
266 | 2177 | 2 | 356124 | 0.892568 | 45074 | 0.999335 |
268 | 2059 | 2 | 358183 | 0.897729 | 45076 | 0.999379 |
269 | 987 | 1 | 359170 | 0.900203 | 45077 | 0.999401 |
272 | 861 | 1 | 360031 | 0.902360 | 45078 | 0.999424 |
279 | 679 | 1 | 360710 | 0.904062 | 45079 | 0.999446 |
287 | 1984 | 2 | 362694 | 0.909035 | 45081 | 0.999490 |
293 | 1254 | 1 | 363948 | 0.912178 | 45082 | 0.999512 |
294 | 2286 | 2 | 366234 | 0.917907 | 45084 | 0.999557 |
298 | 969 | 1 | 367203 | 0.920336 | 45085 | 0.999579 |
300 | 1119 | 1 | 368322 | 0.923141 | 45086 | 0.999601 |
303 | 1260 | 1 | 369582 | 0.926299 | 45087 | 0.999623 |
308 | 2057 | 2 | 371639 | 0.931454 | 45089 | 0.999667 |
312 | 2311 | 2 | 373950 | 0.937246 | 45091 | 0.999712 |
338 | 960 | 1 | 374910 | 0.939652 | 45092 | 0.999734 |
360 | 1352 | 1 | 376262 | 0.943041 | 45093 | 0.999756 |
363 | 1173 | 1 | 377435 | 0.945981 | 45094 | 0.999778 |
366 | 1876 | 1 | 379311 | 0.950683 | 45095 | 0.999800 |
398 | 3005 | 2 | 382316 | 0.958214 | 45097 | 0.999845 |
403 | 2398 | 1 | 384714 | 0.964224 | 45098 | 0.999867 |
411 | 1679 | 1 | 386393 | 0.968433 | 45099 | 0.999889 |
437 | 1632 | 1 | 388025 | 0.972523 | 45100 | 0.999911 |
483 | 2085 | 1 | 390110 | 0.977749 | 45101 | 0.999933 |
489 | 2508 | 1 | 392618 | 0.984035 | 45102 | 0.999956 |
492 | 2715 | 1 | 395333 | 0.990839 | 45103 | 0.999978 |
603 | 3655 | 1 | 398988 | 1.000000 | 45104 | 1.000000 |
229 rows × 6 columns
%matplotlib inline
retweet_grouped_by_users_retweeting_df[['cumulative_retweet_count_sum_percentage', 'cumulative_retweeted_users_percentage']].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x11062fb70>
Remove accounts that were retweeted by 5 or fewer users.
# Cut off the long tail: drop accounts retweeted by 5 or fewer distinct users.
retweet_summary_df = retweet_summary_df.drop(
    index=retweet_summary_df.index[retweet_summary_df['users_retweeting_count'] <= 5]
)
retweet_summary_df['retweet_screen_name'].count()
5374
A type of "unknown" indicates that the account is not matched with a known Twitter account.
retweet_summary_df.sort_values('retweet_count', ascending=False).head(50)
retweet_count | retweet_screen_name | type | users_retweeting_count | percent_of_users_retweeting | |
---|---|---|---|---|---|
user_id | |||||
25073877 | 3655 | realDonaldTrump | politicians | 603 | 0.328431 |
93069110 | 2715 | maggieNYT | journalists | 492 | 0.267974 |
299802277 | 2508 | BraddJaffy | media | 489 | 0.266340 |
1917731 | 2398 | thehill | media | 403 | 0.219499 |
51241574 | 2085 | AP | media | 483 | 0.263072 |
759251 | 1876 | CNN | media | 366 | 0.199346 |
9300262 | 1683 | politico | media | 398 | 0.216776 |
14529929 | 1679 | jaketapper | journalists | 411 | 0.223856 |
2467791 | 1632 | washingtonpost | media | 437 | 0.238017 |
398088661 | 1418 | MEPFuller | journalists | 266 | 0.144880 |
21316253 | 1352 | ZekeJMiller | journalists | 360 | 0.196078 |
807095 | 1322 | nytimes | media | 398 | 0.216776 |
104914594 | 1260 | Phil_Mattingly | journalists | 303 | 0.165033 |
46557945 | 1254 | StevenTDennis | journalists | 293 | 0.159586 |
426802833 | 1237 | AP_Politics | media | 259 | 0.141068 |
13524182 | 1220 | daveweigel | journalists | 294 | 0.160131 |
13850422 | 1206 | CNNPolitics | media | 268 | 0.145969 |
86129724 | 1173 | costareports | journalists | 363 | 0.197712 |
1652541 | 1171 | Reuters | media | 312 | 0.169935 |
34713362 | 1161 | business | media | 246 | 0.133987 |
15433452 | 1140 | JenniferJJacobs | journalists | 312 | 0.169935 |
23232204 | 1124 | ShaneGoldmacher | journalists | 287 | 0.156318 |
32871086 | 1119 | kylegriffin1 | journalists | 300 | 0.163399 |
31127446 | 1066 | markknoller | journalists | 294 | 0.160131 |
39155029 | 1048 | mkraju | journalists | 308 | 0.167756 |
19847765 | 1045 | sahilkapur | journalists | 263 | 0.143246 |
407013776 | 1035 | burgessev | journalists | 226 | 0.123094 |
1367531 | 1012 | FoxNews | media | 232 | 0.126362 |
21252618 | 1009 | JakeSherman | journalists | 308 | 0.167756 |
16187637 | 987 | ChadPergram | journalists | 269 | 0.146514 |
259395895 | 969 | JohnJHarwood | journalists | 298 | 0.162309 |
19107878 | 960 | GlennThrush | journalists | 338 | 0.184096 |
90614279 | 906 | EENewsUpdates | media | 36 | 0.019608 |
19186003 | 892 | seungminkim | journalists | 238 | 0.129630 |
217550862 | 885 | BresPolitico | journalists | 261 | 0.142157 |
28785486 | 861 | ABC | media | 272 | 0.148148 |
59331128 | 860 | PhilipRucker | journalists | 287 | 0.156318 |
17243582 | 856 | blakehounshell | journalists | 257 | 0.139978 |
38936142 | 853 | jdawsey1 | journalists | 268 | 0.145969 |
136550204 | 843 | scottwongDC | journalists | 202 | 0.110022 |
33653195 | 817 | ericawerner | journalists | 246 | 0.133987 |
15446531 | 796 | mattyglesias | journalists | 158 | 0.086057 |
15463671 | 791 | samsteinhp | journalists | 251 | 0.136710 |
18678924 | 767 | jmartNYT | journalists | 219 | 0.119281 |
18956073 | 761 | dcexaminer | media | 105 | 0.057190 |
22129280 | 759 | jimsciutto | journalists | 266 | 0.144880 |
2312829909 | 716 | CQnow | media | 61 | 0.033224 |
326255267 | 713 | KFILE | journalists | 251 | 0.136710 |
22891564 | 697 | chrisgeidner | journalists | 189 | 0.102941 |
15012486 | 686 | CBSNews | media | 204 | 0.111111 |
# Total retweets received per account type, and each type's share of all
# retweets (a fraction in [0, 1]).
types_by_retweet_count_df = retweet_summary_df.groupby('type')[['retweet_count']].sum()
types_by_retweet_count_df['type_percentage'] = (
    types_by_retweet_count_df['retweet_count']
    / types_by_retweet_count_df['retweet_count'].sum()
)
types_by_retweet_count_df.sort_values('retweet_count', ascending=False)
retweet_count | type_percentage | |
---|---|---|
type | ||
journalists | 185494 | 0.588831 |
media | 55239 | 0.175350 |
unknown | 38893 | 0.123462 |
politicians | 13312 | 0.042258 |
academic | 4294 | 0.013631 |
ngo | 4070 | 0.012920 |
pundit | 4053 | 0.012866 |
other_political | 3817 | 0.012117 |
government | 2320 | 0.007365 |
cultural | 1367 | 0.004339 |
business | 1105 | 0.003508 |
other | 870 | 0.002762 |
foreign_political | 187 | 0.000594 |
Retweets by type per user.
# Tag every individual retweet with the type of the retweeted account;
# accounts with no match in the lookup get the type 'unknown'.
retweet_all_join_df = pd.merge(retweet_df, user_type_lookup_df[['type']], how='left', left_on='retweet_user_id', right_index=True)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form operates on an intermediate
# object and is not guaranteed to update the frame under copy-on-write.
retweet_all_join_df['type'] = retweet_all_join_df['type'].fillna('unknown')
# Drop tail: keep only retweets of accounts still present in retweet_summary_df.
retweet_all_join_limited_df = retweet_all_join_df[retweet_all_join_df.retweet_user_id.isin(retweet_summary_df.index)]
retweet_all_join_limited_df.head()
tweet_id | user_id | screen_name | retweet_user_id | retweet_screen_name | tweet_created_at | type | |
---|---|---|---|---|---|---|---|
0 | 847787664963239936 | 285772181 | akesslerdc | 85131054 | jeffzeleny | 2017-03-31 12:28:25+00:00 | journalists |
1 | 847634105118318594 | 285772181 | akesslerdc | 128558424 | erin_pelton | 2017-03-31 02:18:13+00:00 | unknown |
3 | 847601029654880258 | 285772181 | akesslerdc | 58504135 | shaneharris | 2017-03-31 00:06:47+00:00 | journalists |
4 | 847388672785694720 | 285772181 | akesslerdc | 22772264 | carolelee | 2017-03-30 10:02:57+00:00 | journalists |
5 | 847200340613189633 | 285772181 | akesslerdc | 23911915 | joshledermanAP | 2017-03-29 21:34:36+00:00 | journalists |
# Per retweeting user: number of retweets of each account type
# (rows = user_id, columns = type; missing combinations become 0).
retweet_summary_by_user_df = (
    retweet_all_join_limited_df
    .groupby(['user_id', 'type'])
    .size()
    .unstack()
    .fillna(0)
)
# Add a total column
retweet_summary_by_user_df['total'] = retweet_summary_by_user_df.sum(axis=1)
# For every type column (everything except the trailing 'total'), add the
# share of that user's retweets that went to the type.
for col_name in retweet_summary_by_user_df.columns[:-1]:
    percent_col = '{}_percent'.format(col_name)
    retweet_summary_by_user_df[percent_col] = retweet_summary_by_user_df[col_name].div(
        retweet_summary_by_user_df['total']
    )
retweet_summary_by_user_df.head(10)
type | academic | business | cultural | foreign_political | government | journalists | media | ngo | other | other_political | ... | foreign_political_percent | government_percent | journalists_percent | media_percent | ngo_percent | other_percent | other_political_percent | politicians_percent | pundit_percent | unknown_percent |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
100165378 | 1.0 | 1.0 | 2.0 | 0.0 | 0.0 | 17.0 | 7.0 | 0.0 | 0.0 | 1.0 | ... | 0.000000 | 0.000000 | 0.386364 | 0.159091 | 0.000000 | 0.0000 | 0.022727 | 0.068182 | 0.068182 | 0.204545 |
1001991865 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 13.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.235294 | 0.764706 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
1002229862 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 58.0 | 10.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.012821 | 0.743590 | 0.128205 | 0.000000 | 0.0000 | 0.000000 | 0.012821 | 0.000000 | 0.102564 |
100802089 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.571429 | 0.142857 | 0.000000 | 0.0000 | 0.000000 | 0.142857 | 0.000000 | 0.142857 |
100860790 | 2.0 | 0.0 | 0.0 | 1.0 | 1.0 | 93.0 | 39.0 | 1.0 | 0.0 | 2.0 | ... | 0.004717 | 0.004717 | 0.438679 | 0.183962 | 0.004717 | 0.0000 | 0.009434 | 0.014151 | 0.004717 | 0.325472 |
1009749229 | 4.0 | 0.0 | 0.0 | 0.0 | 6.0 | 133.0 | 14.0 | 10.0 | 4.0 | 0.0 | ... | 0.000000 | 0.029851 | 0.661692 | 0.069652 | 0.049751 | 0.0199 | 0.000000 | 0.000000 | 0.019900 | 0.129353 |
1013785220 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20.0 | 6.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.540541 | 0.162162 | 0.000000 | 0.0000 | 0.000000 | 0.108108 | 0.000000 | 0.189189 |
102171691 | 17.0 | 2.0 | 1.0 | 0.0 | 1.0 | 494.0 | 65.0 | 9.0 | 0.0 | 19.0 | ... | 0.000000 | 0.001429 | 0.705714 | 0.092857 | 0.012857 | 0.0000 | 0.027143 | 0.021429 | 0.004286 | 0.105714 |
102238997 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
102789488 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 28.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 0.000000 | 0.000000 | 0.848485 | 0.030303 | 0.000000 | 0.0000 | 0.030303 | 0.000000 | 0.000000 | 0.090909 |
10 rows × 27 columns
That is, for each user determine the percent of retweets by type. Then take the average of each type.
Thus, this retweet analysis is on a per-user basis, normalizing for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)
retweet_summary_by_user_df.filter(axis=1, regex="_percent$").mean()
type academic_percent 0.011965 business_percent 0.003702 cultural_percent 0.006935 foreign_political_percent 0.000697 government_percent 0.011508 journalists_percent 0.509036 media_percent 0.232894 ngo_percent 0.013284 other_percent 0.003179 other_political_percent 0.007644 politicians_percent 0.043010 pundit_percent 0.007771 unknown_percent 0.148374 dtype: float64
The number of users that retweeted an account. Thus, each user counts as 1, even if that user made multiple retweets of the account.
This weights an account that is retweeted by 100 users more heavily than an account that is retweeted 100 times by a single user.
retweet_summary_df.sort_values('users_retweeting_count', ascending=False).head(20)
retweet_count | retweet_screen_name | type | users_retweeting_count | percent_of_users_retweeting | |
---|---|---|---|---|---|
user_id | |||||
25073877 | 3655 | realDonaldTrump | politicians | 603 | 0.328431 |
93069110 | 2715 | maggieNYT | journalists | 492 | 0.267974 |
299802277 | 2508 | BraddJaffy | media | 489 | 0.266340 |
51241574 | 2085 | AP | media | 483 | 0.263072 |
2467791 | 1632 | washingtonpost | media | 437 | 0.238017 |
14529929 | 1679 | jaketapper | journalists | 411 | 0.223856 |
1917731 | 2398 | thehill | media | 403 | 0.219499 |
9300262 | 1683 | politico | media | 398 | 0.216776 |
807095 | 1322 | nytimes | media | 398 | 0.216776 |
759251 | 1876 | CNN | media | 366 | 0.199346 |
86129724 | 1173 | costareports | journalists | 363 | 0.197712 |
21316253 | 1352 | ZekeJMiller | journalists | 360 | 0.196078 |
19107878 | 960 | GlennThrush | journalists | 338 | 0.184096 |
15433452 | 1140 | JenniferJJacobs | journalists | 312 | 0.169935 |
1652541 | 1171 | Reuters | media | 312 | 0.169935 |
39155029 | 1048 | mkraju | journalists | 308 | 0.167756 |
21252618 | 1009 | JakeSherman | journalists | 308 | 0.167756 |
104914594 | 1260 | Phil_Mattingly | journalists | 303 | 0.165033 |
32871086 | 1119 | kylegriffin1 | journalists | 300 | 0.163399 |
259395895 | 969 | JohnJHarwood | journalists | 298 | 0.162309 |
# Sum of distinct-retweeter counts per account type, and each type's share of
# the overall sum (a fraction in [0, 1]).
types_by_users_retweeting_df = retweet_summary_df.groupby('type')[['users_retweeting_count']].sum()
types_by_users_retweeting_df['type_percentage'] = (
    types_by_users_retweeting_df['users_retweeting_count']
    / types_by_users_retweeting_df['users_retweeting_count'].sum()
)
types_by_users_retweeting_df.sort_values('users_retweeting_count', ascending=False)
users_retweeting_count | type_percentage | |
---|---|---|
type | ||
journalists | 79291 | 0.559404 |
unknown | 25236 | 0.178042 |
media | 17631 | 0.124388 |
politicians | 7300 | 0.051502 |
other_political | 2340 | 0.016509 |
ngo | 2242 | 0.015817 |
academic | 2116 | 0.014929 |
pundit | 1961 | 0.013835 |
government | 1485 | 0.010477 |
cultural | 840 | 0.005926 |
business | 711 | 0.005016 |
other | 450 | 0.003175 |
foreign_political | 139 | 0.000981 |
Remember, the tail has been cut off: accounts retweeted by 5 or fewer users were removed above.
retweet_summary_df[retweet_summary_df.type == 'unknown'].count()
retweet_count 2167 retweet_screen_name 2167 type 2167 users_retweeting_count 2167 percent_of_users_retweeting 2167 dtype: int64
retweet_summary_df[retweet_summary_df.type != 'unknown'].count()
retweet_count 3207 retweet_screen_name 3207 type 3207 users_retweeting_count 3207 percent_of_users_retweeting 3207 dtype: int64
# Unmatched ('unknown') accounts retweeted by at least 5 distinct users,
# most-retweeted first, reduced to the columns of interest.
top_not_known_retweet_df = (
    retweet_summary_df.loc[
        (retweet_summary_df['type'] == 'unknown')
        & (retweet_summary_df['users_retweeting_count'] >= 5),
        ['retweet_screen_name', 'retweet_count', 'users_retweeting_count'],
    ]
    .sort_values('retweet_count', ascending=False)
)
top_not_known_retweet_df.head(50)
retweet_screen_name | retweet_count | users_retweeting_count | |
---|---|---|---|
user_id | |||
26574283 | CNBCnow | 210 | 85 |
18028893 | JesseRodriguez | 195 | 104 |
36397873 | FoxNewsResearch | 190 | 13 |
327484803 | WSJCentralBanks | 189 | 9 |
2316383071 | RVAwonk | 166 | 60 |
39279821 | brianklaas | 162 | 75 |
3066084185 | APBusiness | 158 | 29 |
738767160395321345 | ChadBown | 151 | 12 |
15110357 | ReutersBiz | 118 | 38 |
218347440 | chrisdonovan | 115 | 70 |
17470695 | jacobkornbluh | 115 | 32 |
371889510 | TeddyDavisCNN | 112 | 65 |
286998245 | Phil_Lewis_ | 112 | 73 |
1267887043 | RusEmbUSA | 108 | 79 |
20017835 | evanmcmurry | 107 | 59 |
564069706 | NPRKelly | 106 | 59 |
4276158575 | NixonLibrary | 105 | 101 |
78400475 | michikokakutani | 99 | 57 |
297100174 | anneapplebaum | 98 | 61 |
31997610 | AP_CorpComm | 98 | 23 |
20097201 | eorden | 98 | 59 |
21344549 | jonshorman | 98 | 39 |
109369991 | MarkZuckerman | 96 | 23 |
4267082849 | APEastRegion | 96 | 55 |
229599399 | MicahZenko | 94 | 65 |
3889878142 | ChrisSnyderFox | 94 | 23 |
449588356 | Kasparov63 | 93 | 49 |
269314519 | MichaelLaRosaDC | 91 | 56 |
424385350 | APCentralRegion | 91 | 50 |
135173872 | DafnaLinzer | 87 | 61 |
9567972 | CNNnewsroom | 85 | 43 |
16827148 | ChristopherJM | 84 | 32 |
813311743 | NumbersMuncher | 83 | 50 |
47233194 | TreyYingst | 83 | 64 |
14146966 | aravosis | 79 | 29 |
954590804 | planetepics | 79 | 13 |
4091551984 | tribelaw | 79 | 44 |
51263592 | AdamSchefter | 78 | 46 |
474232856 | AP_Planner | 77 | 60 |
539665155 | lyman_brian | 76 | 16 |
3223426134 | SethAbramson | 76 | 28 |
1626294277 | spectatorindex | 75 | 54 |
1754641 | nytimesbusiness | 75 | 31 |
824473943931293697 | RoguePOTUSStaff | 72 | 8 |
788697546 | mgerrydoyle | 72 | 10 |
1222716350 | RCDefense | 71 | 29 |
90275200 | ASLuhn | 71 | 39 |
15111062 | thomaswright08 | 71 | 58 |
92854623 | hannahdreier | 70 | 40 |
1767741 | NYTNational | 69 | 30 |
# Write the unmatched ('unknown' type) retweeted accounts to a CSV file.
top_not_known_retweet_df.to_csv('unknown_retweets.csv')