import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
# Configure the root logger (getLogger() called without a name) to emit
# DEBUG-level messages and above.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Simplify the tweet on load
def reply_transform(tweet):
    """Reduce a raw tweet to the fields needed for the reply analysis.

    Args:
        tweet: Tweet as a dict. The keys read are 'in_reply_to_status_id',
            'id_str', 'user' (a dict with 'id_str' and 'screen_name'),
            'in_reply_to_user_id_str', 'in_reply_to_screen_name',
            'in_reply_to_status_id_str' and 'created_at'.

    Returns:
        A flat dict describing the reply, or None when
        'in_reply_to_status_id' is absent or falsy (the tweet is not a reply
        to another tweet).

    Raises:
        KeyError: If the tweet is a reply but one of the other keys is
            missing.
    """
    # Guard clause: tweets that are not replies are filtered out with None.
    if not tweet.get('in_reply_to_status_id'):
        return None
    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'reply_to_user_id': tweet['in_reply_to_user_id_str'],
        'reply_to_screen_name': tweet['in_reply_to_screen_name'],
        'reply_to_tweet_id': tweet['in_reply_to_status_id_str'],
        # Parse the timestamp string so replies can be ordered by time.
        'tweet_created_at': date_parse(tweet['created_at']),
    }
# Build the reply DataFrame by passing reply_transform and the column names
# to load_tweet_df. The column names are the keys of the dict returned by
# reply_transform (which returns None for tweets that are not replies).
reply_df = load_tweet_df(reply_transform, ['tweet_id', 'user_id', 'screen_name', 'reply_to_user_id',
'reply_to_screen_name', 'reply_to_tweet_id', 'tweet_created_at'])
INFO:root:Loading from tweets/6eea2088e010437da4b6031c2abffdc9_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 DEBUG:root:Loaded 300000 INFO:root:Loading from tweets/a7bcdbde7a104285b92fe26e286f2543_001.json.gz DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 DEBUG:root:Loaded 500000 DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 INFO:root:Loading from tweets/e1c824ff2b3c4c5a9a93a16e5036d09a_001.json.gz DEBUG:root:Loaded 650000 DEBUG:root:Loaded 700000 DEBUG:root:Loaded 750000
# Number of replies with a non-null reply_to_user_id.
reply_df[['reply_to_user_id']].count()
reply_to_user_id 118570 dtype: int64
# Preview the first rows of the reply DataFrame.
reply_df.head()
tweet_id | user_id | screen_name | reply_to_user_id | reply_to_screen_name | reply_to_tweet_id | tweet_created_at | |
---|---|---|---|---|---|---|---|
0 | 847428582821449730 | 780221130 | loren_duggan | 140286364 | nielslesniewski | 847424577009369094 | 2017-03-30 12:41:33+00:00 |
1 | 846472179902550017 | 29607664 | adamliptak | 106729916 | espinsegall | 846471674769936384 | 2017-03-27 21:21:09+00:00 |
2 | 846357290018099200 | 29607664 | adamliptak | 147586500 | EdWhelanEPPC | 846356576399212544 | 2017-03-27 13:44:37+00:00 |
3 | 847789885692018690 | 9484732 | amacker | 26117379 | scottpllc | 847046284297031681 | 2017-03-31 12:37:14+00:00 |
4 | 847486491727085568 | 9484732 | amacker | 9484732 | amacker | 847486211174219776 | 2017-03-30 16:31:39+00:00 |
# From the replies, extract a map of replied-to user ids to screen names.
# For each replied-to user id, keep the screen name from the row with the
# latest tweet_created_at.
# DataFrame.ix has been removed from pandas; a single label-based .loc call
# selects the same rows and columns.
latest_reply_to_rows = reply_df.groupby('reply_to_user_id')['tweet_created_at'].idxmax()
reply_to_user_id_lookup_df = reply_df.loc[
    latest_reply_to_rows, ['reply_to_user_id', 'reply_to_screen_name']
].set_index(['reply_to_user_id'])
reply_to_user_id_lookup_df.count()
reply_to_screen_name 27041 dtype: int64
# Preview the first rows of the replied-to user id -> screen name lookup.
reply_to_user_id_lookup_df.head()
reply_to_screen_name | |
---|---|
reply_to_user_id | |
100005598 | hotelkeys |
10000772 | JMoLawre |
100025240 | itsbull |
100028531 | Stevempars |
100036032 | Mitch_Tischler |
# From the users who wrote the replies (not the accounts replied to), extract
# a map of user ids to screen names.
# For each user id, keep the screen name from the row with the latest
# tweet_created_at.
# DataFrame.ix has been removed from pandas; a single label-based .loc call
# selects the same rows and columns.
latest_user_rows = reply_df.groupby('user_id')['tweet_created_at'].idxmax()
user_id_lookup_df = reply_df.loc[
    latest_user_rows, ['user_id', 'screen_name']
].set_index(['user_id'])
user_id_lookup_df.count()
screen_name 1510 dtype: int64
# Count the replies received by each replied-to user id.
# The number of rows should match the row count of the replied-to
# user id -> screen name lookup.
replies_per_reply_to_user = reply_df.groupby('reply_to_user_id').size()
reply_to_summary_user_id_df = replies_per_reply_to_user.to_frame('reply_to_count')
reply_to_summary_user_id_df.count()
reply_to_count 27041 dtype: int64
# Preview the first rows of the per-account reply counts.
reply_to_summary_user_id_df.head()
reply_to_count | |
---|---|
reply_to_user_id | |
100005598 | 5 |
10000772 | 1 |
100025240 | 1 |
100028531 | 3 |
100036032 | 1 |
# Join with user id map
# Both frames are indexed by reply_to_user_id, so the join aligns on the
# index and adds the reply_to_screen_name column to the counts.
reply_to_summary_screen_name_df = reply_to_summary_user_id_df.join(reply_to_user_id_lookup_df)
reply_to_summary_screen_name_df.count()
reply_to_count 27041 reply_to_screen_name 27041 dtype: int64
# Preview the first rows of the counts joined with screen names.
reply_to_summary_screen_name_df.head()
reply_to_count | reply_to_screen_name | |
---|---|---|
reply_to_user_id | ||
100005598 | 5 | hotelkeys |
10000772 | 1 | JMoLawre |
100025240 | 1 | itsbull |
100028531 | 3 | Stevempars |
100036032 | 1 | Mitch_Tischler |
# Load lookups of known users
from utils import load_user_type_lookup_df
# Keep only the 'type' column of the lookup returned by the helper.
user_type_lookup_df = load_user_type_lookup_df()[['type']]
user_type_lookup_df.count()
type 13160 dtype: int64
# Preview the first rows of the known-user type lookup.
user_type_lookup_df.head()
type | |
---|---|
user_id | |
2345626885 | journalists |
780221130 | journalists |
285772181 | journalists |
29607664 | journalists |
9484732 | journalists |
# Number of known accounts per type.
user_type_lookup_df['type'].value_counts()
media 4538 journalists 3576 government 3055 politicians 817 ngo 250 pundit 195 other 160 other_political 156 cultural 131 academic 129 business 125 foreign_political 28 Name: type, dtype: int64
# Join the replied-to accounts with the known users (left join on the index,
# so every replied-to account is kept).
reply_to_summary_type_df = reply_to_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Accounts without a match in the lookup have no type; label them 'unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form acts on an intermediate object,
# is deprecated in pandas 2.x and has no effect under copy-on-write.
reply_to_summary_type_df['type'] = reply_to_summary_type_df['type'].fillna('unknown')
reply_to_summary_type_df.index.name = 'user_id'
reply_to_summary_type_df.count()
reply_to_count 27041 reply_to_screen_name 27041 type 27041 dtype: int64
# Preview the first rows of the summary with account types attached.
reply_to_summary_type_df.head()
reply_to_count | reply_to_screen_name | type | |
---|---|---|---|
user_id | |||
100005598 | 5 | hotelkeys | unknown |
10000772 | 1 | JMoLawre | unknown |
100025240 | 1 | itsbull | unknown |
100028531 | 3 | Stevempars | unknown |
100036032 | 1 | Mitch_Tischler | unknown |
Count the number of distinct users replying to each account, which is different from the number of replies the account received.
# Reduce the replies to unique (replied-to user, replying user) pairs, so a
# replying user counts once per account no matter how often they replied.
reply_to_user_id_per_user_df = reply_df[['reply_to_user_id', 'user_id']].drop_duplicates()
# Number of distinct replying users per replied-to account, indexed by the
# replied-to account's id under the index name 'user_id'.
reply_to_user_id_per_user_summary_df = (
    reply_to_user_id_per_user_df
    .groupby('reply_to_user_id')
    .size()
    .to_frame('users_replying_to_count')
    .rename_axis('user_id')
)
# Attach the distinct-replier counts to the per-account summary; both frames
# are indexed by user_id, so the join aligns on the index.
reply_to_summary_df = reply_to_summary_type_df.join(reply_to_user_id_per_user_summary_df)
# Share of the replying users that replied to each account. Despite the
# column name, the value is a fraction (it is not multiplied by 100).
replying_user_total = user_id_lookup_df['screen_name'].count()
reply_to_summary_df['percent_of_users_replying_to'] = (
    reply_to_summary_df['users_replying_to_count'] / replying_user_total
)
reply_to_summary_df.head()
reply_to_count | reply_to_screen_name | type | users_replying_to_count | percent_of_users_replying_to | |
---|---|---|---|---|---|
user_id | |||||
100005598 | 5 | hotelkeys | unknown | 3 | 0.001987 |
10000772 | 1 | JMoLawre | unknown | 1 | 0.000662 |
100025240 | 1 | itsbull | unknown | 1 | 0.000662 |
100028531 | 3 | Stevempars | unknown | 3 | 0.001987 |
100036032 | 1 | Mitch_Tischler | unknown | 1 | 0.000662 |
For users that made any replies. It is also possible to figure this out for all users.
# Summary statistics of the number of replies made by each replying user.
reply_df['user_id'].value_counts().describe()
count 1510.000000 mean 78.523179 std 290.394805 min 1.000000 25% 3.000000 50% 13.000000 75% 57.000000 max 8009.000000 Name: user_id, dtype: float64
# Group the replied-to accounts by how many distinct users replied to them.
# Per group: 'sum' is the total number of replies received and 'size' is the
# number of accounts in the group.
# The aggregations are passed by name: handing np.sum / np.size to agg() is
# deprecated in recent pandas, and the names produce the same 'sum' / 'size'
# column labels.
replies_grouped_by_users_replying_to_df = (
    reply_to_summary_df[['reply_to_count', 'users_replying_to_count']]
    .groupby(by='users_replying_to_count')
    .agg(['sum', 'size'])
)
replies_per_group = replies_grouped_by_users_replying_to_df['reply_to_count', 'sum']
accounts_per_group = replies_grouped_by_users_replying_to_df['reply_to_count', 'size']

# Running total of replies, and its share of all replies.
cumulative_replies = replies_per_group.cumsum()
replies_grouped_by_users_replying_to_df['cumulative_reply_to_count_sum'] = cumulative_replies
replies_grouped_by_users_replying_to_df['cumulative_reply_to_count_sum_percentage'] = (
    cumulative_replies / replies_per_group.sum()
)

# Running total of replied-to accounts, and its share of all accounts.
cumulative_accounts = accounts_per_group.cumsum()
replies_grouped_by_users_replying_to_df['cumulative_replied_to_users'] = cumulative_accounts
replies_grouped_by_users_replying_to_df['cumulative_replied_to_users_percentage'] = (
    cumulative_accounts / accounts_per_group.sum()
)
replies_grouped_by_users_replying_to_df
reply_to_count | cumulative_reply_to_count_sum | cumulative_reply_to_count_sum_percentage | cumulative_replied_to_users | cumulative_replied_to_users_percentage | ||
---|---|---|---|---|---|---|
sum | size | |||||
users_replying_to_count | ||||||
1 | 36339 | 21420 | 36339 | 0.306477 | 21420 | 0.792130 |
2 | 11064 | 2616 | 47403 | 0.399789 | 24036 | 0.888872 |
3 | 6909 | 1045 | 54312 | 0.458059 | 25081 | 0.927517 |
4 | 4561 | 467 | 58873 | 0.496525 | 25548 | 0.944788 |
5 | 4022 | 348 | 62895 | 0.530446 | 25896 | 0.957657 |
6 | 3277 | 206 | 66172 | 0.558084 | 26102 | 0.965275 |
7 | 3144 | 133 | 69316 | 0.584600 | 26235 | 0.970193 |
8 | 2111 | 109 | 71427 | 0.602404 | 26344 | 0.974224 |
9 | 2940 | 88 | 74367 | 0.627199 | 26432 | 0.977479 |
10 | 2926 | 81 | 77293 | 0.651877 | 26513 | 0.980474 |
11 | 1794 | 72 | 79087 | 0.667007 | 26585 | 0.983137 |
12 | 1990 | 58 | 81077 | 0.683790 | 26643 | 0.985282 |
13 | 938 | 31 | 82015 | 0.691701 | 26674 | 0.986428 |
14 | 1214 | 37 | 83229 | 0.701940 | 26711 | 0.987796 |
15 | 1646 | 32 | 84875 | 0.715822 | 26743 | 0.988980 |
16 | 1433 | 25 | 86308 | 0.727908 | 26768 | 0.989904 |
17 | 1814 | 29 | 88122 | 0.743207 | 26797 | 0.990977 |
18 | 1312 | 26 | 89434 | 0.754272 | 26823 | 0.991938 |
19 | 1329 | 25 | 90763 | 0.765480 | 26848 | 0.992863 |
20 | 1299 | 19 | 92062 | 0.776436 | 26867 | 0.993565 |
21 | 1327 | 15 | 93389 | 0.787628 | 26882 | 0.994120 |
22 | 1716 | 19 | 95105 | 0.802100 | 26901 | 0.994823 |
23 | 370 | 7 | 95475 | 0.805221 | 26908 | 0.995082 |
24 | 1513 | 11 | 96988 | 0.817981 | 26919 | 0.995488 |
25 | 730 | 7 | 97718 | 0.824138 | 26926 | 0.995747 |
26 | 478 | 8 | 98196 | 0.828169 | 26934 | 0.996043 |
27 | 1019 | 8 | 99215 | 0.836763 | 26942 | 0.996339 |
28 | 797 | 6 | 100012 | 0.843485 | 26948 | 0.996561 |
29 | 1046 | 10 | 101058 | 0.852307 | 26958 | 0.996931 |
30 | 157 | 3 | 101215 | 0.853631 | 26961 | 0.997042 |
... | ... | ... | ... | ... | ... | ... |
38 | 882 | 7 | 106132 | 0.895100 | 26998 | 0.998410 |
39 | 569 | 3 | 106701 | 0.899899 | 27001 | 0.998521 |
40 | 435 | 1 | 107136 | 0.903568 | 27002 | 0.998558 |
41 | 243 | 1 | 107379 | 0.905617 | 27003 | 0.998595 |
43 | 363 | 3 | 107742 | 0.908678 | 27006 | 0.998706 |
44 | 72 | 1 | 107814 | 0.909286 | 27007 | 0.998743 |
45 | 276 | 2 | 108090 | 0.911613 | 27009 | 0.998817 |
46 | 119 | 1 | 108209 | 0.912617 | 27010 | 0.998854 |
47 | 237 | 1 | 108446 | 0.914616 | 27011 | 0.998891 |
48 | 281 | 2 | 108727 | 0.916986 | 27013 | 0.998965 |
50 | 2744 | 4 | 111471 | 0.940128 | 27017 | 0.999112 |
51 | 371 | 3 | 111842 | 0.943257 | 27020 | 0.999223 |
53 | 224 | 2 | 112066 | 0.945146 | 27022 | 0.999297 |
54 | 121 | 1 | 112187 | 0.946167 | 27023 | 0.999334 |
55 | 370 | 1 | 112557 | 0.949287 | 27024 | 0.999371 |
56 | 150 | 1 | 112707 | 0.950552 | 27025 | 0.999408 |
57 | 124 | 1 | 112831 | 0.951598 | 27026 | 0.999445 |
58 | 213 | 1 | 113044 | 0.953395 | 27027 | 0.999482 |
59 | 1725 | 2 | 114769 | 0.967943 | 27029 | 0.999556 |
60 | 214 | 1 | 114983 | 0.969748 | 27030 | 0.999593 |
62 | 123 | 1 | 115106 | 0.970785 | 27031 | 0.999630 |
63 | 848 | 1 | 115954 | 0.977937 | 27032 | 0.999667 |
64 | 115 | 1 | 116069 | 0.978907 | 27033 | 0.999704 |
65 | 113 | 1 | 116182 | 0.979860 | 27034 | 0.999741 |
67 | 212 | 1 | 116394 | 0.981648 | 27035 | 0.999778 |
71 | 997 | 2 | 117391 | 0.990057 | 27037 | 0.999852 |
72 | 230 | 1 | 117621 | 0.991996 | 27038 | 0.999889 |
76 | 147 | 1 | 117768 | 0.993236 | 27039 | 0.999926 |
78 | 186 | 1 | 117954 | 0.994805 | 27040 | 0.999963 |
94 | 616 | 1 | 118570 | 1.000000 | 27041 | 1.000000 |
67 rows × 6 columns
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Plot the two cumulative fractions against the frame's index
# (users_replying_to_count).
replies_grouped_by_users_replying_to_df[['cumulative_reply_to_count_sum_percentage', 'cumulative_replied_to_users_percentage']].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1102d0d30>
Remove accounts that were replied to by only 1 user.
# Trim the long tail: drop, in place, every account that exactly one user
# replied to.
replied_to_by_one_user = reply_to_summary_df['users_replying_to_count'] == 1
single_replier_index = reply_to_summary_df.loc[replied_to_by_one_user].index
reply_to_summary_df.drop(single_replier_index, inplace=True)
# Number of accounts remaining after the drop.
reply_to_summary_df['reply_to_screen_name'].count()
5621
A type of "unknown" indicates that the account is not matched with a known Twitter account.
# The 50 accounts that received the most replies.
reply_to_summary_df.sort_values('reply_to_count', ascending=False).head(50)
reply_to_count | reply_to_screen_name | type | users_replying_to_count | percent_of_users_replying_to | |
---|---|---|---|---|---|
user_id | |||||
22891564 | 2373 | chrisgeidner | journalists | 50 | 0.033113 |
3817401 | 1516 | ericgeller | journalists | 59 | 0.039073 |
118130765 | 881 | dylanlscott | journalists | 71 | 0.047020 |
46557945 | 848 | StevenTDennis | journalists | 63 | 0.041722 |
275207082 | 734 | AlexParkerDC | journalists | 24 | 0.015894 |
17466186 | 707 | tomlobianco | journalists | 17 | 0.011258 |
906734342 | 694 | KimberlyRobinsn | journalists | 7 | 0.004636 |
19847765 | 647 | sahilkapur | journalists | 34 | 0.022517 |
398088661 | 616 | MEPFuller | journalists | 94 | 0.062252 |
20176845 | 535 | heathdwilliams | other | 3 | 0.001987 |
493756786 | 512 | amir_anasr | journalists | 22 | 0.014570 |
46555511 | 494 | Alex_Panetta | journalists | 2 | 0.001325 |
26559241 | 491 | fordm | journalists | 27 | 0.017881 |
317980134 | 462 | CraigCaplan | journalists | 9 | 0.005960 |
17907987 | 451 | timkmak | journalists | 29 | 0.019205 |
103016675 | 435 | AaronMehta | journalists | 40 | 0.026490 |
14597239 | 370 | TonyRomm | journalists | 55 | 0.036424 |
47758416 | 359 | marissaaevans | journalists | 2 | 0.001325 |
21696279 | 347 | brianbeutler | journalists | 37 | 0.024503 |
52392666 | 341 | ZoeTillman | journalists | 9 | 0.005960 |
225265639 | 333 | ddale8 | journalists | 16 | 0.010596 |
16285830 | 330 | philewing | journalists | 10 | 0.006623 |
227790723 | 314 | RichardRubinDC | journalists | 39 | 0.025828 |
15146659 | 305 | JSwiftTWS | journalists | 34 | 0.022517 |
23332846 | 299 | mattzap | journalists | 4 | 0.002649 |
90478926 | 290 | MikeSacksEsq | journalists | 21 | 0.013907 |
16061946 | 283 | kelmej | journalists | 21 | 0.013907 |
29771100 | 276 | lawrencehurley | journalists | 25 | 0.016556 |
63717541 | 263 | phillyrich1 | journalists | 2 | 0.001325 |
1337271 | 255 | darth | other | 32 | 0.021192 |
46955476 | 243 | GrahamDavidA | journalists | 41 | 0.027152 |
14362404 | 242 | bradheath | journalists | 10 | 0.006623 |
158072303 | 240 | ValerieInsinna | journalists | 19 | 0.012583 |
16459325 | 237 | ryanbeckwith | journalists | 47 | 0.031126 |
19186003 | 230 | seungminkim | journalists | 72 | 0.047682 |
950531 | 230 | pbump | journalists | 28 | 0.018543 |
22429979 | 222 | nycsouthpaw | pundit | 31 | 0.020530 |
12245632 | 222 | jackshafer | journalists | 38 | 0.025166 |
23664429 | 221 | dnvolz | journalists | 22 | 0.014570 |
46213956 | 218 | JamilSmith | journalists | 7 | 0.004636 |
80111587 | 217 | JeffYoung | journalists | 31 | 0.020530 |
407013776 | 214 | burgessev | journalists | 60 | 0.039735 |
11771512 | 213 | OKnox | journalists | 58 | 0.038411 |
16244449 | 212 | jbarro | journalists | 67 | 0.044371 |
16125224 | 209 | ByronTau | journalists | 59 | 0.039073 |
36607254 | 207 | Oriana0214 | journalists | 20 | 0.013245 |
437019753 | 206 | TimothyNoah1 | journalists | 12 | 0.007947 |
269911034 | 206 | YAppelbaum | journalists | 12 | 0.007947 |
14529929 | 193 | jaketapper | journalists | 39 | 0.025828 |
48120914 | 186 | SopanDeb | journalists | 78 | 0.051656 |
# Total replies received per account type, plus each type's share of all
# replies in the summary.
types_by_reply_to_count_df = reply_to_summary_df.groupby('type')[['reply_to_count']].sum()
reply_to_count_total = types_by_reply_to_count_df['reply_to_count'].sum()
types_by_reply_to_count_df['type_percentage'] = (
    types_by_reply_to_count_df['reply_to_count'] / reply_to_count_total
)
types_by_reply_to_count_df.sort_values('reply_to_count', ascending=False)
reply_to_count | type_percentage | |
---|---|---|
type | ||
journalists | 59032 | 0.717880 |
unknown | 12890 | 0.156754 |
pundit | 3276 | 0.039839 |
other_political | 1687 | 0.020515 |
other | 1577 | 0.019178 |
academic | 1195 | 0.014532 |
ngo | 1131 | 0.013754 |
media | 605 | 0.007357 |
politicians | 333 | 0.004050 |
business | 265 | 0.003223 |
cultural | 127 | 0.001544 |
government | 113 | 0.001374 |
Replies by type per user.
# Attach the known type of the replied-to account to every individual reply
# (left merge, so replies to accounts missing from the lookup are kept).
reply_all_join_df = pd.merge(
    reply_df,
    user_type_lookup_df[['type']],
    how='left',
    left_on='reply_to_user_id',
    right_index=True,
)
# Replies to accounts missing from the lookup get the type 'unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form acts on an intermediate object,
# is deprecated in pandas 2.x and has no effect under copy-on-write.
reply_all_join_df['type'] = reply_all_join_df['type'].fillna('unknown')
# Drop tail: keep only replies to accounts still present in the summary.
reply_all_join_df = reply_all_join_df[reply_all_join_df['reply_to_user_id'].isin(reply_to_summary_df.index)]
reply_all_join_df.head()
tweet_id | user_id | screen_name | reply_to_user_id | reply_to_screen_name | reply_to_tweet_id | tweet_created_at | type | |
---|---|---|---|---|---|---|---|---|
0 | 847428582821449730 | 780221130 | loren_duggan | 140286364 | nielslesniewski | 847424577009369094 | 2017-03-30 12:41:33+00:00 | journalists |
1 | 846472179902550017 | 29607664 | adamliptak | 106729916 | espinsegall | 846471674769936384 | 2017-03-27 21:21:09+00:00 | academic |
2 | 846357290018099200 | 29607664 | adamliptak | 147586500 | EdWhelanEPPC | 846356576399212544 | 2017-03-27 13:44:37+00:00 | academic |
3 | 847789885692018690 | 9484732 | amacker | 26117379 | scottpllc | 847046284297031681 | 2017-03-31 12:37:14+00:00 | unknown |
4 | 847486491727085568 | 9484732 | amacker | 9484732 | amacker | 847486211174219776 | 2017-03-30 16:31:39+00:00 | journalists |
# Count replies per (replying user, replied-to account type): one row per
# user, one column per type, with 0 where a user never replied to that type.
reply_summary_by_user_df = reply_all_join_df.groupby(['user_id', 'type']).size().unstack().fillna(0)
# Add a total column
reply_summary_by_user_df['total'] = reply_summary_by_user_df.sum(axis=1)
# Per-user share of each type. columns[:-1] is evaluated once, before any
# *_percent column is added, so it covers every column except 'total'.
for col_name in reply_summary_by_user_df.columns[:-1]:
    reply_summary_by_user_df['{}_percent'.format(col_name)] = (
        reply_summary_by_user_df[col_name] / reply_summary_by_user_df['total']
    )
reply_summary_by_user_df.head(10)
type | academic | business | cultural | government | journalists | media | ngo | other | other_political | politicians | ... | cultural_percent | government_percent | journalists_percent | media_percent | ngo_percent | other_percent | other_political_percent | politicians_percent | pundit_percent | unknown_percent |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
100165378 | 0.0 | 0.0 | 13.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 0.52 | 0.000000 | 0.160000 | 0.0 | 0.000000 | 0.0 | 0.04 | 0.04 | 0.000000 | 0.240000 |
1001991865 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 1.000000 |
1002229862 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 0.750000 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 0.250000 |
100802089 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 0.333333 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 0.666667 |
100860790 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 0.500000 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 0.500000 |
1009749229 | 2.0 | 0.0 | 0.0 | 2.0 | 73.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.024096 | 0.879518 | 0.0 | 0.048193 | 0.0 | 0.00 | 0.00 | 0.024096 | 0.000000 |
1013785220 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 0.000000 |
102171691 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.142857 | 0.714286 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 0.142857 |
102238997 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 0.500000 | 0.0 | 0.000000 | 0.5 | 0.00 | 0.00 | 0.000000 | 0.000000 |
102994740 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.000000 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.00 | 0.00 | 0.000000 | 0.000000 |
10 rows × 25 columns
That is, for each user determine the percent of replies by type. Then take the average of each type.
Thus, this reply analysis is on a per-user basis, accounting for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)
# Average, across users, of every column whose name ends in "_percent".
reply_summary_by_user_df.filter(axis=1, regex="_percent$").mean()
type academic_percent 0.018718 business_percent 0.007318 cultural_percent 0.004393 government_percent 0.004183 journalists_percent 0.620576 media_percent 0.023570 ngo_percent 0.016683 other_percent 0.010210 other_political_percent 0.019884 politicians_percent 0.006863 pundit_percent 0.019340 unknown_percent 0.248261 dtype: float64
The number of users that replied to an account. Thus, each user counts as 1, even if that user made multiple replies to the account.
This weights an account that is replied to by 100 users more heavily than an account that is replied to 100 times by a single user.
# The 20 accounts replied to by the most distinct users.
reply_to_summary_df.sort_values('users_replying_to_count', ascending=False).head(20)
reply_to_count | reply_to_screen_name | type | users_replying_to_count | percent_of_users_replying_to | |
---|---|---|---|---|---|
user_id | |||||
398088661 | 616 | MEPFuller | journalists | 94 | 0.062252 |
48120914 | 186 | SopanDeb | journalists | 78 | 0.051656 |
93069110 | 147 | maggieNYT | journalists | 76 | 0.050331 |
19186003 | 230 | seungminkim | journalists | 72 | 0.047682 |
118130765 | 881 | dylanlscott | journalists | 71 | 0.047020 |
13524182 | 116 | daveweigel | journalists | 71 | 0.047020 |
16244449 | 212 | jbarro | journalists | 67 | 0.044371 |
14412533 | 113 | CillizzaCNN | journalists | 65 | 0.043046 |
19107878 | 115 | GlennThrush | journalists | 64 | 0.042384 |
46557945 | 848 | StevenTDennis | journalists | 63 | 0.041722 |
218325695 | 123 | Bencjacobs | journalists | 62 | 0.041060 |
407013776 | 214 | burgessev | journalists | 60 | 0.039735 |
16125224 | 209 | ByronTau | journalists | 59 | 0.039073 |
3817401 | 1516 | ericgeller | journalists | 59 | 0.039073 |
11771512 | 213 | OKnox | journalists | 58 | 0.038411 |
51462013 | 124 | lizzieohreally | journalists | 57 | 0.037748 |
217550862 | 150 | BresPolitico | journalists | 56 | 0.037086 |
14597239 | 370 | TonyRomm | journalists | 55 | 0.036424 |
4207961 | 121 | chrislhayes | journalists | 54 | 0.035762 |
326255267 | 89 | KFILE | journalists | 53 | 0.035099 |
# Sum of distinct-replier counts per account type, plus each type's share of
# the overall sum.
types_by_users_replying_to_df = reply_to_summary_df.groupby('type')[['users_replying_to_count']].sum()
users_replying_to_total = types_by_users_replying_to_df['users_replying_to_count'].sum()
types_by_users_replying_to_df['type_percentage'] = (
    types_by_users_replying_to_df['users_replying_to_count'] / users_replying_to_total
)
types_by_users_replying_to_df.sort_values('users_replying_to_count', ascending=False)
users_replying_to_count | type_percentage | |
---|---|---|
type | ||
journalists | 15913 | 0.575682 |
unknown | 7644 | 0.276536 |
pundit | 998 | 0.036104 |
other_political | 883 | 0.031944 |
academic | 510 | 0.018450 |
media | 453 | 0.016388 |
ngo | 451 | 0.016316 |
other | 301 | 0.010889 |
politicians | 203 | 0.007344 |
business | 133 | 0.004812 |
cultural | 79 | 0.002858 |
government | 74 | 0.002677 |
Remember, the tail has been cut off
# Number of accounts in the summary whose type is 'unknown'.
reply_to_summary_df[reply_to_summary_df.type == 'unknown'].count()
reply_to_count 3120 reply_to_screen_name 3120 type 3120 users_replying_to_count 3120 percent_of_users_replying_to 3120 dtype: int64
# Number of accounts in the summary with a type other than 'unknown'.
reply_to_summary_df[reply_to_summary_df.type != 'unknown'].count()
reply_to_count 2501 reply_to_screen_name 2501 type 2501 users_replying_to_count 2501 percent_of_users_replying_to 2501 dtype: int64
# Accounts of unknown type that at least 5 distinct users replied to, ordered
# by replies received (most first) and reduced to the columns of interest.
is_unknown_type = reply_to_summary_df['type'] == 'unknown'
has_five_or_more_repliers = reply_to_summary_df['users_replying_to_count'] >= 5
top_not_known_reply_to_df = (
    reply_to_summary_df[is_unknown_type & has_five_or_more_repliers]
    .sort_values('reply_to_count', ascending=False)
    [['reply_to_screen_name', 'reply_to_count', 'users_replying_to_count']]
)
top_not_known_reply_to_df.head(50)
reply_to_screen_name | reply_to_count | users_replying_to_count | |
---|---|---|---|
user_id | |||
1173121356 | xenocryptsite | 17 | 5 |
18111042 | michaelpfreeman | 11 | 5 |
39100192 | Southfive | 11 | 5 |
415794979 | AndStrats | 10 | 5 |
166207886 | BrianLaslie | 10 | 5 |
4440118883 | luke_j_obrien | 10 | 6 |
48585729 | cam_mason | 10 | 6 |
14372270 | mcbyrne | 9 | 5 |
338164741 | bsdtectr | 9 | 5 |
55038792 | ELSchillinger | 9 | 5 |
152145921 | jason_howerton | 9 | 5 |
15111062 | thomaswright08 | 9 | 5 |
14668111 | BGrueskin | 9 | 5 |
8790702 | dhm | 9 | 6 |
80669530 | bungdan | 9 | 6 |
593909348 | mis2127 | 9 | 7 |
2936965923 | RobertKYarbro | 9 | 5 |
6576292 | jhaverly | 9 | 7 |
6931262 | rachsyme | 9 | 5 |
21700839 | BharatKrishnan | 8 | 6 |
19087309 | AlexKoppelman | 8 | 5 |
1922583464 | SpectatrCitizen | 8 | 5 |
26377458 | pcdunham | 8 | 6 |
241280143 | CommsDirector | 8 | 5 |
20097201 | eorden | 8 | 6 |
34643610 | EricBoehlert | 8 | 7 |
29090846 | _Drew_McCoy_ | 8 | 6 |
23141473 | calvinstowell | 8 | 5 |
343063239 | Carter_PE | 8 | 7 |
21093964 | TiffanyHaverly | 8 | 6 |
392705809 | econwonk | 8 | 6 |
32071013 | DeanClancy | 8 | 5 |
166782000 | henrycobb | 8 | 7 |
56701775 | DavidRutz | 8 | 5 |
59133139 | keithcrc | 8 | 7 |
108338399 | lukeoneil47 | 8 | 6 |
24972610 | K_Schallhorn | 8 | 5 |
15826886 | CarolBlymire | 8 | 7 |
71569841 | JoshSchwerin | 8 | 7 |
4120521028 | scoejarborough | 8 | 5 |
14364006 | jrosenbaum | 8 | 6 |
278175882 | leximccammond | 7 | 5 |
8475532 | sdkstl | 7 | 5 |
61664932 | jasonahart | 7 | 5 |
40804509 | matthewjsinger | 7 | 6 |
296513648 | jdubya65 | 7 | 6 |
51639553 | sdjacksondc | 7 | 5 |
475957325 | DamonLinker | 7 | 5 |
47020338 | VincentMorris | 7 | 5 |
384841636 | hash_said | 7 | 6 |
# Write the table of unknown-type accounts to unknown_replies.csv.
top_not_known_reply_to_df.to_csv('unknown_replies.csv')