Replies¶

Data prep¶

Load the data and count.¶

In [1]:

import logging

import numpy as np
import pandas as pd
from dateutil.parser import parse as date_parse

from utils import load_tweet_df, load_user_type_lookup_df, tweet_type

# Configure the root logger at DEBUG level so that INFO and DEBUG records sent to it are shown
# (presumably the loading progress messages come from load_tweet_df — confirm in utils).
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Simplify the tweet on load
def reply_transform(tweet):
    """Reduce a raw tweet dict to the fields used by the reply analysis.

    Returns None when the tweet has no truthy 'in_reply_to_status_id'
    (it is then not treated as a reply). Otherwise returns a flat dict holding
    the replying tweet/user, the replied-to user/tweet and the parsed
    creation timestamp.
    """
    # Guard clause: anything that is not a reply is discarded.
    if not tweet.get('in_reply_to_status_id'):
        return None

    return {
        # The tweet itself and its author
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        # The account and tweet being replied to
        'reply_to_user_id': tweet['in_reply_to_user_id_str'],
        'reply_to_screen_name': tweet['in_reply_to_screen_name'],
        'reply_to_tweet_id': tweet['in_reply_to_status_id_str'],
        # 'created_at' is parsed from its string form into a datetime
        'tweet_created_at': date_parse(tweet['created_at'])
    }

# Build the reply DataFrame by passing reply_transform and the expected column names
# to load_tweet_df (presumably tweets for which the transform returns None are skipped —
# confirm in utils).
reply_columns = [
    'tweet_id', 'user_id', 'screen_name', 'reply_to_user_id',
    'reply_to_screen_name', 'reply_to_tweet_id', 'tweet_created_at',
]
reply_df = load_tweet_df(reply_transform, reply_columns)

INFO:root:Loading from tweets/6eea2088e010437da4b6031c2abffdc9_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/a7bcdbde7a104285b92fe26e286f2543_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
INFO:root:Loading from tweets/e1c824ff2b3c4c5a9a93a16e5036d09a_001.json.gz
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000

Number of replies found in the dataset¶

In [2]:

# Count the non-null reply_to_user_id values, i.e. the number of reply rows loaded
reply_df[['reply_to_user_id']].count()

Out[2]:

reply_to_user_id    118570
dtype: int64

The reply data¶

In [3]:

# Preview the first rows of the reply data
reply_df.head()

Out[3]:

	tweet_id	user_id	screen_name	reply_to_user_id	reply_to_screen_name	reply_to_tweet_id	tweet_created_at
0	847428582821449730	780221130	loren_duggan	140286364	nielslesniewski	847424577009369094	2017-03-30 12:41:33+00:00
1	846472179902550017	29607664	adamliptak	106729916	espinsegall	846471674769936384	2017-03-27 21:21:09+00:00
2	846357290018099200	29607664	adamliptak	147586500	EdWhelanEPPC	846356576399212544	2017-03-27 13:44:37+00:00
3	847789885692018690	9484732	amacker	26117379	scottpllc	847046284297031681	2017-03-31 12:37:14+00:00
4	847486491727085568	9484732	amacker	9484732	amacker	847486211174219776	2017-03-30 16:31:39+00:00

Create lookup of replied user ids to screen names¶

In [4]:

# From the replies, extract a map of replied-to user ids to screen names.
# For every replied-to user, take the row of the most recent reply (idxmax of
# tweet_created_at), presumably so that the latest screen name wins if it changed.
latest_reply_idx = reply_df.groupby('reply_to_user_id')['tweet_created_at'].idxmax()
# .loc with row labels and a column list replaces the original `.loc[...].ix[:, cols]`;
# the `.ix` indexer was removed in pandas 1.0.
reply_to_user_id_lookup_df = (
    reply_df
    .loc[latest_reply_idx, ['reply_to_user_id', 'reply_to_screen_name']]
    .set_index(['reply_to_user_id'])
)
reply_to_user_id_lookup_df.count()

Out[4]:

reply_to_screen_name    27041
dtype: int64

In [5]:

# Preview the replied-to user id -> screen name lookup
reply_to_user_id_lookup_df.head()

Out[5]:

	reply_to_screen_name
reply_to_user_id
100005598	hotelkeys
10000772	JMoLawre
100025240	itsbull
100028531	Stevempars
100036032	Mitch_Tischler

Create lookup of user ids to screen names¶

In [6]:

# From the replying users (not the replied-to accounts), extract a map of user ids to screen names.
# For every replying user, take the row of their most recent reply (idxmax of
# tweet_created_at), presumably so that the latest screen name wins if it changed.
latest_tweet_idx = reply_df.groupby('user_id')['tweet_created_at'].idxmax()
# .loc with row labels and a column list replaces the original `.loc[...].ix[:, cols]`;
# the `.ix` indexer was removed in pandas 1.0.
user_id_lookup_df = (
    reply_df
    .loc[latest_tweet_idx, ['user_id', 'screen_name']]
    .set_index(['user_id'])
)
user_id_lookup_df.count()

Out[6]:

screen_name    1510
dtype: int64

Group replies by reply to user id¶

In [7]:

# Count the replies received by each replied-to user (grouped by reply_to_user_id).
# This count should match the replied-to user id lookup count.
reply_to_summary_user_id_df = (
    reply_df
    .groupby('reply_to_user_id')
    .size()
    .to_frame('reply_to_count')
)
reply_to_summary_user_id_df.count()

Out[7]:

reply_to_count    27041
dtype: int64

In [8]:

# Preview the per-account reply counts
reply_to_summary_user_id_df.head()

Out[8]:

	reply_to_count
reply_to_user_id
100005598	5
10000772	1
100025240	1
100028531	3
100036032	1

Add back in the reply to screen names¶

In [9]:

# Join with user id map
# Both frames are indexed by reply_to_user_id, so the index join (left by default)
# attaches reply_to_screen_name to every reply count row.
reply_to_summary_screen_name_df = reply_to_summary_user_id_df.join(reply_to_user_id_lookup_df)
reply_to_summary_screen_name_df.count()

Out[9]:

reply_to_count          27041
reply_to_screen_name    27041
dtype: int64

In [10]:

# Preview the reply counts with screen names attached
reply_to_summary_screen_name_df.head()

Out[10]:

	reply_to_count	reply_to_screen_name
reply_to_user_id
100005598	5	hotelkeys
10000772	1	JMoLawre
100025240	1	itsbull
100028531	3	Stevempars
100036032	1	Mitch_Tischler

Add user types for replied-to accounts¶

In [11]:

# Load the lookup of known users, keeping only the 'type' column.
# load_user_type_lookup_df is imported with the other imports at the top of the notebook
# rather than in the middle of the analysis.
user_type_lookup_df = load_user_type_lookup_df()[['type']]
user_type_lookup_df.count()

Out[11]:

type    13160
dtype: int64

In [12]:

# Preview the known user type lookup
user_type_lookup_df.head()

Out[12]:

	type
user_id
2345626885	journalists
780221130	journalists
285772181	journalists
29607664	journalists
9484732	journalists

In [13]:

# Number of known accounts in each type category
user_type_lookup_df['type'].value_counts()

Out[13]:

media                4538
journalists          3576
government           3055
politicians           817
ngo                   250
pundit                195
other                 160
other_political       156
cultural              131
academic              129
business              125
foreign_political      28
Name: type, dtype: int64

In [14]:

# Join the replied-to accounts with the known users.
# The left join keeps every replied-to account; accounts missing from the lookup get NaN for type.
reply_to_summary_type_df = reply_to_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Assign the filled column back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form warns in pandas 2.x and does not update the frame
# under Copy-on-Write.
reply_to_summary_type_df['type'] = reply_to_summary_type_df['type'].fillna('unknown')
# From here on the index (replied-to user id) is simply called user_id.
reply_to_summary_type_df.index.name = 'user_id'
reply_to_summary_type_df.count()

Out[14]:

reply_to_count          27041
reply_to_screen_name    27041
type                    27041
dtype: int64

In [15]:

# Preview the summary with types ('unknown' where no known account matched)
reply_to_summary_type_df.head()

Out[15]:

	reply_to_count	reply_to_screen_name	type
user_id
100005598	5	hotelkeys	unknown
10000772	1	JMoLawre	unknown
100025240	1	itsbull	unknown
100028531	3	Stevempars	unknown
100036032	1	Mitch_Tischler	unknown

Add number of users replying to¶

Which is different than the number of replies to.

In [16]:

# De-duplicate (replied-to user, replying user) pairs so that a user who replied to the
# same account several times is counted only once for that account.
reply_to_user_id_per_user_df = reply_df[['reply_to_user_id', 'user_id']].drop_duplicates()
# Number of distinct replying users per replied-to account.
reply_to_user_id_per_user_summary_df = (
    reply_to_user_id_per_user_df
    .groupby('reply_to_user_id')
    .size()
    .to_frame('users_replying_to_count')
)
# Match the index name used by reply_to_summary_type_df.
reply_to_user_id_per_user_summary_df.index.name = 'user_id'
# Join with reply_to_summary_type_df
reply_to_summary_df = reply_to_summary_type_df.join(reply_to_user_id_per_user_summary_df)
# Fraction (0-1) of all replying users that replied to each account.
reply_to_summary_df['percent_of_users_replying_to'] = (
    reply_to_summary_df['users_replying_to_count'] / user_id_lookup_df['screen_name'].count()
)
reply_to_summary_df.head()

Out[16]:

	reply_to_count	reply_to_screen_name	type	users_replying_to_count	percent_of_users_replying_to
user_id
100005598	5	hotelkeys	unknown	3	0.001987
10000772	1	JMoLawre	unknown	1	0.000662
100025240	1	itsbull	unknown	1	0.000662
100028531	3	Stevempars	unknown	3	0.001987
100036032	1	Mitch_Tischler	unknown	1	0.000662

Reply summary¶

Replies per user¶

For users that made any replies. It is also possible to figure this out for all users.

In [17]:

# Summary statistics of the number of replies made per replying user
reply_df['user_id'].value_counts().describe()

Out[17]:

count    1510.000000
mean       78.523179
std       290.394805
min         1.000000
25%         3.000000
50%        13.000000
75%        57.000000
max      8009.000000
Name: user_id, dtype: float64

How long is the tail?¶

In [18]:

# For each value of users_replying_to_count, total the replies received ('sum') and count
# the replied-to accounts ('size'); then accumulate both to show how long the tail is.
# String aggregation names replace np.sum / np.size: pandas 2.1+ warns when NumPy callables
# are passed to agg. The resulting column labels ('sum', 'size') are unchanged.
replies_grouped_by_users_replying_to_df = (
    reply_to_summary_df[['reply_to_count', 'users_replying_to_count']]
    .groupby(by='users_replying_to_count')
    .agg(['sum', 'size'])
)
# Running total of replies, and its share of all replies.
replies_grouped_by_users_replying_to_df['cumulative_reply_to_count_sum'] = (
    replies_grouped_by_users_replying_to_df['reply_to_count', 'sum'].cumsum()
)
replies_grouped_by_users_replying_to_df['cumulative_reply_to_count_sum_percentage'] = (
    replies_grouped_by_users_replying_to_df['cumulative_reply_to_count_sum']
    / replies_grouped_by_users_replying_to_df['reply_to_count', 'sum'].sum()
)
# Running total of replied-to accounts, and its share of all replied-to accounts.
replies_grouped_by_users_replying_to_df['cumulative_replied_to_users'] = (
    replies_grouped_by_users_replying_to_df['reply_to_count', 'size'].cumsum()
)
replies_grouped_by_users_replying_to_df['cumulative_replied_to_users_percentage'] = (
    replies_grouped_by_users_replying_to_df['cumulative_replied_to_users']
    / replies_grouped_by_users_replying_to_df['reply_to_count', 'size'].sum()
)
replies_grouped_by_users_replying_to_df

Out[18]:

	reply_to_count		cumulative_reply_to_count_sum	cumulative_reply_to_count_sum_percentage	cumulative_replied_to_users	cumulative_replied_to_users_percentage
	sum	size
users_replying_to_count
1	36339	21420	36339	0.306477	21420	0.792130
2	11064	2616	47403	0.399789	24036	0.888872
3	6909	1045	54312	0.458059	25081	0.927517
4	4561	467	58873	0.496525	25548	0.944788
5	4022	348	62895	0.530446	25896	0.957657
6	3277	206	66172	0.558084	26102	0.965275
7	3144	133	69316	0.584600	26235	0.970193
8	2111	109	71427	0.602404	26344	0.974224
9	2940	88	74367	0.627199	26432	0.977479
10	2926	81	77293	0.651877	26513	0.980474
11	1794	72	79087	0.667007	26585	0.983137
12	1990	58	81077	0.683790	26643	0.985282
13	938	31	82015	0.691701	26674	0.986428
14	1214	37	83229	0.701940	26711	0.987796
15	1646	32	84875	0.715822	26743	0.988980
16	1433	25	86308	0.727908	26768	0.989904
17	1814	29	88122	0.743207	26797	0.990977
18	1312	26	89434	0.754272	26823	0.991938
19	1329	25	90763	0.765480	26848	0.992863
20	1299	19	92062	0.776436	26867	0.993565
21	1327	15	93389	0.787628	26882	0.994120
22	1716	19	95105	0.802100	26901	0.994823
23	370	7	95475	0.805221	26908	0.995082
24	1513	11	96988	0.817981	26919	0.995488
25	730	7	97718	0.824138	26926	0.995747
26	478	8	98196	0.828169	26934	0.996043
27	1019	8	99215	0.836763	26942	0.996339
28	797	6	100012	0.843485	26948	0.996561
29	1046	10	101058	0.852307	26958	0.996931
30	157	3	101215	0.853631	26961	0.997042
...	...	...	...	...	...	...
38	882	7	106132	0.895100	26998	0.998410
39	569	3	106701	0.899899	27001	0.998521
40	435	1	107136	0.903568	27002	0.998558
41	243	1	107379	0.905617	27003	0.998595
43	363	3	107742	0.908678	27006	0.998706
44	72	1	107814	0.909286	27007	0.998743
45	276	2	108090	0.911613	27009	0.998817
46	119	1	108209	0.912617	27010	0.998854
47	237	1	108446	0.914616	27011	0.998891
48	281	2	108727	0.916986	27013	0.998965
50	2744	4	111471	0.940128	27017	0.999112
51	371	3	111842	0.943257	27020	0.999223
53	224	2	112066	0.945146	27022	0.999297
54	121	1	112187	0.946167	27023	0.999334
55	370	1	112557	0.949287	27024	0.999371
56	150	1	112707	0.950552	27025	0.999408
57	124	1	112831	0.951598	27026	0.999445
58	213	1	113044	0.953395	27027	0.999482
59	1725	2	114769	0.967943	27029	0.999556
60	214	1	114983	0.969748	27030	0.999593
62	123	1	115106	0.970785	27031	0.999630
63	848	1	115954	0.977937	27032	0.999667
64	115	1	116069	0.978907	27033	0.999704
65	113	1	116182	0.979860	27034	0.999741
67	212	1	116394	0.981648	27035	0.999778
71	997	2	117391	0.990057	27037	0.999852
72	230	1	117621	0.991996	27038	0.999889
76	147	1	117768	0.993236	27039	0.999926
78	186	1	117954	0.994805	27040	0.999963
94	616	1	118570	1.000000	27041	1.000000

67 rows × 6 columns

In [19]:

# Render matplotlib figures inline, then plot the cumulative share of replies and the
# cumulative share of replied-to accounts against users_replying_to_count (the frame's index).
%matplotlib inline
replies_grouped_by_users_replying_to_df[['cumulative_reply_to_count_sum_percentage', 'cumulative_replied_to_users_percentage']].plot()

Out[19]:

<matplotlib.axes._subplots.AxesSubplot at 0x1102d0d30>

Cut off the tail.¶

Removes users that were only replied to by 1 user.

In [20]:

# Remove the accounts that were replied to by exactly one user (the long tail).
# The filtered frame is bound back to the same name instead of using drop(..., inplace=True),
# so the cell builds its result from its input rather than mutating it in place.
replied_to_by_single_user = reply_to_summary_df['users_replying_to_count'] == 1
reply_to_summary_df = reply_to_summary_df[~replied_to_by_single_user]
reply_to_summary_df['reply_to_screen_name'].count()

Out[20]:

Approach 1: By reply to count¶

Top accounts (by reply to count)¶

A type of "unknown" indicates that the account is not matched with a known Twitter account.

In [21]:

# The 50 replied-to accounts with the most replies received
reply_to_summary_df.sort_values('reply_to_count', ascending=False).head(50)

Out[21]:

	reply_to_count	reply_to_screen_name	type	users_replying_to_count	percent_of_users_replying_to
user_id
22891564	2373	chrisgeidner	journalists	50	0.033113
3817401	1516	ericgeller	journalists	59	0.039073
118130765	881	dylanlscott	journalists	71	0.047020
46557945	848	StevenTDennis	journalists	63	0.041722
275207082	734	AlexParkerDC	journalists	24	0.015894
17466186	707	tomlobianco	journalists	17	0.011258
906734342	694	KimberlyRobinsn	journalists	7	0.004636
19847765	647	sahilkapur	journalists	34	0.022517
398088661	616	MEPFuller	journalists	94	0.062252
20176845	535	heathdwilliams	other	3	0.001987
493756786	512	amir_anasr	journalists	22	0.014570
46555511	494	Alex_Panetta	journalists	2	0.001325
26559241	491	fordm	journalists	27	0.017881
317980134	462	CraigCaplan	journalists	9	0.005960
17907987	451	timkmak	journalists	29	0.019205
103016675	435	AaronMehta	journalists	40	0.026490
14597239	370	TonyRomm	journalists	55	0.036424
47758416	359	marissaaevans	journalists	2	0.001325
21696279	347	brianbeutler	journalists	37	0.024503
52392666	341	ZoeTillman	journalists	9	0.005960
225265639	333	ddale8	journalists	16	0.010596
16285830	330	philewing	journalists	10	0.006623
227790723	314	RichardRubinDC	journalists	39	0.025828
15146659	305	JSwiftTWS	journalists	34	0.022517
23332846	299	mattzap	journalists	4	0.002649
90478926	290	MikeSacksEsq	journalists	21	0.013907
16061946	283	kelmej	journalists	21	0.013907
29771100	276	lawrencehurley	journalists	25	0.016556
63717541	263	phillyrich1	journalists	2	0.001325
1337271	255	darth	other	32	0.021192
46955476	243	GrahamDavidA	journalists	41	0.027152
14362404	242	bradheath	journalists	10	0.006623
158072303	240	ValerieInsinna	journalists	19	0.012583
16459325	237	ryanbeckwith	journalists	47	0.031126
19186003	230	seungminkim	journalists	72	0.047682
950531	230	pbump	journalists	28	0.018543
22429979	222	nycsouthpaw	pundit	31	0.020530
12245632	222	jackshafer	journalists	38	0.025166
23664429	221	dnvolz	journalists	22	0.014570
46213956	218	JamilSmith	journalists	7	0.004636
80111587	217	JeffYoung	journalists	31	0.020530
407013776	214	burgessev	journalists	60	0.039735
11771512	213	OKnox	journalists	58	0.038411
16244449	212	jbarro	journalists	67	0.044371
16125224	209	ByronTau	journalists	59	0.039073
36607254	207	Oriana0214	journalists	20	0.013245
437019753	206	TimothyNoah1	journalists	12	0.007947
269911034	206	YAppelbaum	journalists	12	0.007947
14529929	193	jaketapper	journalists	39	0.025828
48120914	186	SopanDeb	journalists	78	0.051656

Account types (by reply to count)¶

In [22]:

# Total replies received per account type, with each type's share of all replies.
types_by_reply_to_count_df = reply_to_summary_df.groupby('type')[['reply_to_count']].sum()
total_reply_to_count = types_by_reply_to_count_df['reply_to_count'].sum()
types_by_reply_to_count_df['type_percentage'] = (
    types_by_reply_to_count_df['reply_to_count'] / total_reply_to_count
)
types_by_reply_to_count_df.sort_values('reply_to_count', ascending=False)

Out[22]:

	reply_to_count	type_percentage
type
journalists	59032	0.717880
unknown	12890	0.156754
pundit	3276	0.039839
other_political	1687	0.020515
other	1577	0.019178
academic	1195	0.014532
ngo	1131	0.013754
media	605	0.007357
politicians	333	0.004050
business	265	0.003223
cultural	127	0.001544
government	113	0.001374

Approach 2: Per user¶

Replies by type per user.

Add type by merging screen name lookup¶

In [23]:

# Attach the known account type of the replied-to user to every reply.
# The left merge keeps all replies; replied-to users absent from the lookup get NaN for type.
reply_all_join_df = pd.merge(reply_df, user_type_lookup_df[['type']], how='left', left_on='reply_to_user_id', right_index=True)
# Assign the filled column back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form warns in pandas 2.x and does not update the frame
# under Copy-on-Write.
reply_all_join_df['type'] = reply_all_join_df['type'].fillna('unknown')
# Drop tail: keep only replies to accounts still present in reply_to_summary_df's index
reply_all_join_df = reply_all_join_df[reply_all_join_df.reply_to_user_id.isin(reply_to_summary_df.index)]
reply_all_join_df.head()

Out[23]:

	tweet_id	user_id	screen_name	reply_to_user_id	reply_to_screen_name	reply_to_tweet_id	tweet_created_at	type
0	847428582821449730	780221130	loren_duggan	140286364	nielslesniewski	847424577009369094	2017-03-30 12:41:33+00:00	journalists
1	846472179902550017	29607664	adamliptak	106729916	espinsegall	846471674769936384	2017-03-27 21:21:09+00:00	academic
2	846357290018099200	29607664	adamliptak	147586500	EdWhelanEPPC	846356576399212544	2017-03-27 13:44:37+00:00	academic
3	847789885692018690	9484732	amacker	26117379	scottpllc	847046284297031681	2017-03-31 12:37:14+00:00	unknown
4	847486491727085568	9484732	amacker	9484732	amacker	847486211174219776	2017-03-30 16:31:39+00:00	journalists

In [24]:

# One row per replying user, one column per replied-to account type, holding the number of
# replies that user sent to accounts of that type (0 where there were none).
reply_summary_by_user_df = (
    reply_all_join_df
    .groupby(['user_id', 'type'])
    .size()
    .unstack()
    .fillna(0)
)
# Remember the type columns before any derived column is added.
type_col_names = list(reply_summary_by_user_df.columns)
# Add a total column
reply_summary_by_user_df['total'] = reply_summary_by_user_df[type_col_names].sum(axis=1)
# Share of each user's replies that went to each type.
for col_name in type_col_names:
    reply_summary_by_user_df['{}_percent'.format(col_name)] = (
        reply_summary_by_user_df[col_name] / reply_summary_by_user_df['total']
    )
reply_summary_by_user_df.head(10)

Out[24]:

type	academic	business	cultural	government	journalists	media	ngo	other	other_political	politicians	...	cultural_percent	government_percent	journalists_percent	media_percent	ngo_percent	other_percent	other_political_percent	politicians_percent	pundit_percent	unknown_percent
user_id
100165378	0.0	0.0	13.0	0.0	4.0	0.0	0.0	0.0	1.0	1.0	...	0.52	0.000000	0.160000	0.0	0.000000	0.0	0.04	0.04	0.000000	0.240000
1001991865	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.000000	0.000000	0.0	0.000000	0.0	0.00	0.00	0.000000	1.000000
1002229862	0.0	0.0	0.0	0.0	3.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.000000	0.750000	0.0	0.000000	0.0	0.00	0.00	0.000000	0.250000
100802089	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.000000	0.333333	0.0	0.000000	0.0	0.00	0.00	0.000000	0.666667
100860790	0.0	0.0	0.0	0.0	4.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.000000	0.500000	0.0	0.000000	0.0	0.00	0.00	0.000000	0.500000
1009749229	2.0	0.0	0.0	2.0	73.0	0.0	4.0	0.0	0.0	0.0	...	0.00	0.024096	0.879518	0.0	0.048193	0.0	0.00	0.00	0.024096	0.000000
1013785220	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.000000	1.000000	0.0	0.000000	0.0	0.00	0.00	0.000000	0.000000
102171691	0.0	0.0	0.0	1.0	5.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.142857	0.714286	0.0	0.000000	0.0	0.00	0.00	0.000000	0.142857
102238997	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0	...	0.00	0.000000	0.500000	0.0	0.000000	0.5	0.00	0.00	0.000000	0.000000
102994740	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	...	0.00	0.000000	1.000000	0.0	0.000000	0.0	0.00	0.00	0.000000	0.000000

10 rows × 25 columns

Average of percent of replies by type for each user¶

That is, for each user determine the percent of replies by type. Then take the average of each type.

Thus, this reply analysis is on a per-user basis, accounting for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)

In [25]:

# Average each *_percent column across users, so every user carries equal weight
reply_summary_by_user_df.filter(axis=1, regex="_percent$").mean()

Out[25]:

type
academic_percent           0.018718
business_percent           0.007318
cultural_percent           0.004393
government_percent         0.004183
journalists_percent        0.620576
media_percent              0.023570
ngo_percent                0.016683
other_percent              0.010210
other_political_percent    0.019884
politicians_percent        0.006863
pundit_percent             0.019340
unknown_percent            0.248261
dtype: float64

Approach 3: By count of users replying to¶

The number of users that replied to an account. Thus, each user counts as 1, even if that user made multiple replies to the account.

This weights an account that is replied to by 100 users more heavily than an account that is replied to 100 times by a single user.

In [26]:

# The 20 accounts replied to by the most distinct users
reply_to_summary_df.sort_values('users_replying_to_count', ascending=False).head(20)

Out[26]:

	reply_to_count	reply_to_screen_name	type	users_replying_to_count	percent_of_users_replying_to
user_id
398088661	616	MEPFuller	journalists	94	0.062252
48120914	186	SopanDeb	journalists	78	0.051656
93069110	147	maggieNYT	journalists	76	0.050331
19186003	230	seungminkim	journalists	72	0.047682
118130765	881	dylanlscott	journalists	71	0.047020
13524182	116	daveweigel	journalists	71	0.047020
16244449	212	jbarro	journalists	67	0.044371
14412533	113	CillizzaCNN	journalists	65	0.043046
19107878	115	GlennThrush	journalists	64	0.042384
46557945	848	StevenTDennis	journalists	63	0.041722
218325695	123	Bencjacobs	journalists	62	0.041060
407013776	214	burgessev	journalists	60	0.039735
16125224	209	ByronTau	journalists	59	0.039073
3817401	1516	ericgeller	journalists	59	0.039073
11771512	213	OKnox	journalists	58	0.038411
51462013	124	lizzieohreally	journalists	57	0.037748
217550862	150	BresPolitico	journalists	56	0.037086
14597239	370	TonyRomm	journalists	55	0.036424
4207961	121	chrislhayes	journalists	54	0.035762
326255267	89	KFILE	journalists	53	0.035099

Account types (by count of users replying to)¶

In [27]:

# Sum of distinct-replier counts per account type, with each type's share of the overall sum.
types_by_users_replying_to_df = reply_to_summary_df.groupby('type')[['users_replying_to_count']].sum()
total_users_replying_to_count = types_by_users_replying_to_df['users_replying_to_count'].sum()
types_by_users_replying_to_df['type_percentage'] = (
    types_by_users_replying_to_df['users_replying_to_count'] / total_users_replying_to_count
)
types_by_users_replying_to_df.sort_values('users_replying_to_count', ascending=False)

Out[27]:

	users_replying_to_count	type_percentage
type
journalists	15913	0.575682
unknown	7644	0.276536
pundit	998	0.036104
other_political	883	0.031944
academic	510	0.018450
media	453	0.016388
ngo	451	0.016316
other	301	0.010889
politicians	203	0.007344
business	133	0.004812
cultural	79	0.002858
government	74	0.002677

Unknown accounts¶

Remember, the tail has been cut off

Number of unknown accounts¶

In [28]:

 # Per-column non-null counts for the replied-to accounts whose type is 'unknown'
 reply_to_summary_df[reply_to_summary_df.type == 'unknown'].count()

Out[28]:

reply_to_count                  3120
reply_to_screen_name            3120
type                            3120
users_replying_to_count         3120
percent_of_users_replying_to    3120
dtype: int64

Number of known accounts¶

In [29]:

 # Per-column non-null counts for the replied-to accounts whose type is anything but 'unknown'
 reply_to_summary_df[reply_to_summary_df.type != 'unknown'].count()

Out[29]:

reply_to_count                  2501
reply_to_screen_name            2501
type                            2501
users_replying_to_count         2501
percent_of_users_replying_to    2501
dtype: int64

Top unknown accounts by reply-to count that are replied to by at least 5 users¶

In [30]:

# Unknown-type accounts replied to by at least 5 distinct users, ranked by replies received.
is_unknown_type = reply_to_summary_df['type'] == 'unknown'
has_enough_repliers = reply_to_summary_df['users_replying_to_count'] >= 5
top_not_known_reply_to_df = (
    reply_to_summary_df
    .loc[is_unknown_type & has_enough_repliers]
    .sort_values('reply_to_count', ascending=False)
    .loc[:, ['reply_to_screen_name', 'reply_to_count', 'users_replying_to_count']]
)
top_not_known_reply_to_df.head(50)

Out[30]:

	reply_to_screen_name	reply_to_count	users_replying_to_count
user_id
1173121356	xenocryptsite	17	5
18111042	michaelpfreeman	11	5
39100192	Southfive	11	5
415794979	AndStrats	10	5
166207886	BrianLaslie	10	5
4440118883	luke_j_obrien	10	6
48585729	cam_mason	10	6
14372270	mcbyrne	9	5
338164741	bsdtectr	9	5
55038792	ELSchillinger	9	5
152145921	jason_howerton	9	5
15111062	thomaswright08	9	5
14668111	BGrueskin	9	5
8790702	dhm	9	6
80669530	bungdan	9	6
593909348	mis2127	9	7
2936965923	RobertKYarbro	9	5
6576292	jhaverly	9	7
6931262	rachsyme	9	5
21700839	BharatKrishnan	8	6
19087309	AlexKoppelman	8	5
1922583464	SpectatrCitizen	8	5
26377458	pcdunham	8	6
241280143	CommsDirector	8	5
20097201	eorden	8	6
34643610	EricBoehlert	8	7
29090846	_Drew_McCoy_	8	6
23141473	calvinstowell	8	5
343063239	Carter_PE	8	7
21093964	TiffanyHaverly	8	6
392705809	econwonk	8	6
32071013	DeanClancy	8	5
166782000	henrycobb	8	7
56701775	DavidRutz	8	5
59133139	keithcrc	8	7
108338399	lukeoneil47	8	6
24972610	K_Schallhorn	8	5
15826886	CarolBlymire	8	7
71569841	JoshSchwerin	8	7
4120521028	scoejarborough	8	5
14364006	jrosenbaum	8	6
278175882	leximccammond	7	5
8475532	sdkstl	7	5
61664932	jasonahart	7	5
40804509	matthewjsinger	7	6
296513648	jdubya65	7	6
51639553	sdjacksondc	7	5
475957325	DamonLinker	7	5
47020338	VincentMorris	7	5
384841636	hash_said	7	6

Write top accounts to file¶

In [31]:

# Write the ranked unknown accounts to unknown_replies.csv (relative path, so it lands in the
# current working directory)
top_not_known_reply_to_df.to_csv('unknown_replies.csv')

In [ ]: