%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt
# Configure the root logger (getLogger() with no name) to emit DEBUG and above.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Show floats with thousands separators and two decimals (padded to width 20)
# so that pandas does not display them in scientific notation.
pd.options.display.float_format = '{:20,.2f}'.format
def tweet_transform(tweet):
    """Flatten one raw tweet dict into the row fields used in this notebook.

    The row holds the tweet's id and parsed creation time, the author's id,
    screen name, parsed account creation time and status count, and the
    tweet's type as returned by ``tweet_type``.
    """
    row = {
        'tweet_id': tweet['id_str'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }
    # All remaining fields except the type come from the embedded user object.
    author = tweet['user']
    row.update(
        user_id=author['id_str'],
        screen_name=author['screen_name'],
        user_created_at=date_parse(author['created_at']),
        tweets_to_date=author['statuses_count'],
        tweet_type=tweet_type(tweet),
    )
    return row
# Column names passed to the loader; they match the keys produced by
# tweet_transform.
tweet_columns = [
    'tweet_id',
    'user_id',
    'screen_name',
    'tweet_created_at',
    'user_created_at',
    'tweets_to_date',
    'tweet_type',
]
tweet_df = load_tweet_df(tweet_transform, tweet_columns)
# Non-null count per column, as a quick completeness check.
tweet_df.count()
INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_001.json.gz DEBUG:root:Loaded 50000 DEBUG:root:Loaded 100000 DEBUG:root:Loaded 150000 DEBUG:root:Loaded 200000 DEBUG:root:Loaded 250000 INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_002.json.gz INFO:root:Loading from tweets/7bff8603fb4a49d5953197361d548346_001.json.gz DEBUG:root:Loaded 300000 DEBUG:root:Loaded 350000 DEBUG:root:Loaded 400000 DEBUG:root:Loaded 450000 INFO:root:Loading from tweets/b3f330f5b6cc4572b6d7dabc3752b2b9_001.json.gz DEBUG:root:Loaded 500000 DEBUG:root:Loaded 550000 DEBUG:root:Loaded 600000 DEBUG:root:Loaded 650000
tweet_id 650350 user_id 650350 screen_name 650350 tweet_created_at 650350 user_created_at 650350 tweets_to_date 650350 tweet_type 650350 dtype: int64
tweet_df.head()
tweet_id | user_id | screen_name | tweet_created_at | user_created_at | tweets_to_date | tweet_type | |
---|---|---|---|---|---|---|---|
0 | 859463382042378240 | 2343897943 | AmberCStrong | 2017-05-02 17:43:32+00:00 | 2014-02-14 17:33:36+00:00 | 1701 | original |
1 | 859803200152588288 | 307982591 | JaxAlemany | 2017-05-03 16:13:51+00:00 | 2011-05-30 16:43:13+00:00 | 6328 | original |
2 | 859788527705493504 | 307982591 | JaxAlemany | 2017-05-03 15:15:33+00:00 | 2011-05-30 16:43:13+00:00 | 6328 | quote |
3 | 859788479076732930 | 307982591 | JaxAlemany | 2017-05-03 15:15:22+00:00 | 2011-05-30 16:43:13+00:00 | 6328 | original |
4 | 859781841955500032 | 307982591 | JaxAlemany | 2017-05-03 14:48:59+00:00 | 2011-05-30 16:43:13+00:00 | 6328 | retweet |
# Count tweets per (user, tweet type), then pivot the tweet types into columns.
user_tweet_count_df = tweet_df.groupby(['user_id', 'tweet_type']).size().unstack()
# A user with no tweets of a given type gets NaN from the pivot; treat it as 0.
user_tweet_count_df = user_tweet_count_df.fillna(0)
# Total tweets per user across the four tweet types.
user_tweet_count_df['tweets_in_dataset'] = (
    user_tweet_count_df[['original', 'quote', 'reply', 'retweet']].sum(axis=1)
)
# Bin users by total tweet count at the 90th and 99th percentiles.
user_tweet_count_df['tweets_in_dataset_bin'] = pd.qcut(
    user_tweet_count_df['tweets_in_dataset'],
    q=[0, .9, .99, 1.],
    labels=['Bottom 90%', 'Middle 9%', 'Top 1%'],
)
user_tweet_count_df.head()
tweet_type | original | quote | reply | retweet | tweets_in_dataset | tweets_in_dataset_bin |
---|---|---|---|---|---|---|
user_id | ||||||
1001991865 | 12.00 | 1.00 | 3.00 | 35.00 | 51.00 | Bottom 90% |
1002229862 | 35.00 | 5.00 | 2.00 | 99.00 | 141.00 | Bottom 90% |
100802089 | 4.00 | 3.00 | 5.00 | 12.00 | 24.00 | Bottom 90% |
100860790 | 117.00 | 19.00 | 9.00 | 215.00 | 360.00 | Bottom 90% |
1009749229 | 79.00 | 85.00 | 34.00 | 156.00 | 354.00 | Bottom 90% |
This information was either coded in the spreadsheet or looked up for each user via the API.
# Load the per-user lookup table; column names are supplied explicitly.
user_info_df = pd.read_csv(
    'source_data/user_info_lookup.csv',
    names=[
        'screen_name', 'user_id', 'name', 'organization', 'position',
        'gender', 'followers_count', 'following_count', 'tweet_count',
        'user_created_at', 'verified', 'protected',
    ],
    # Read ids as strings so they compare equal to the string ids ('id_str')
    # taken from the tweets when the two tables are joined on user_id.
    dtype={'user_id': str},
)
user_info_df = user_info_df.set_index('user_id')
# Non-null count per column.
user_info_df.count()
screen_name 2484 name 2484 organization 2455 position 2481 gender 2483 followers_count 2484 following_count 2484 tweet_count 2484 user_created_at 2484 verified 2484 protected 2484 dtype: int64
user_info_df.head()
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | |
---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||
20711445 | ninglin | Glinski, Nina | NaN | Freelance Reporter | F | 968 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False |
258917371 | davidjenders | Enders, David | NaN | Journalist | M | 1451 | 480 | 6299 | Mon Feb 28 19:52:03 +0000 2011 | True | False |
297046834 | mattbarakat | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 754 | 349 | 620 | Wed May 11 20:55:24 +0000 2011 | True | False |
455585786 | kimberlyeatkins | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2399 | 2661 | 5846 | Thu Jan 05 08:26:46 +0000 2012 | True | False |
42584840 | toulavlahou | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2713 | 198 | 6325 | Tue May 26 07:41:38 +0000 2009 | False | False |
# Join
# Left join on the user_id index: every user in the lookup table is kept,
# including users that have no tweets in the dataset.
user_summary_df = user_info_df.join(user_tweet_count_df, how='left')
# Fill NaNs
# Use a single DataFrame-level fillna with a per-column mapping instead of
# `df[col].fillna(..., inplace=True)`. The chained inplace form operates on a
# temporary Series and does not update the DataFrame when pandas
# Copy-on-Write is active. Columns not listed here (for example
# tweets_in_dataset_bin) are not filled.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
# Non-null count per column.
user_summary_df.count()
screen_name 2484 name 2484 organization 2484 position 2481 gender 2483 followers_count 2484 following_count 2484 tweet_count 2484 user_created_at 2484 verified 2484 protected 2484 original 2484 quote 2484 reply 2484 retweet 2484 tweets_in_dataset 2484 tweets_in_dataset_bin 2272 dtype: int64
user_summary_df.head()
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | original | quote | reply | retweet | tweets_in_dataset | tweets_in_dataset_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||
20711445 | ninglin | Glinski, Nina | Freelance Reporter | F | 968 | 507 | 909 | Thu Feb 12 20:00:53 +0000 2009 | False | False | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | |
258917371 | davidjenders | Enders, David | Journalist | M | 1451 | 480 | 6299 | Mon Feb 28 19:52:03 +0000 2011 | True | False | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | |
297046834 | mattbarakat | Barakat, Matthew | Associated Press | Northern Virginia Correspondent | M | 754 | 349 | 620 | Wed May 11 20:55:24 +0000 2011 | True | False | 12.00 | 0.00 | 0.00 | 2.00 | 14.00 | Bottom 90% |
455585786 | kimberlyeatkins | Atkins, Kimberly | Boston Herald | Chief Washington Reporter/Columnist | F | 2399 | 2661 | 5846 | Thu Jan 05 08:26:46 +0000 2012 | True | False | 228.00 | 144.00 | 39.00 | 196.00 | 607.00 | Bottom 90% |
42584840 | toulavlahou | Vlahou, Toula | CQ Roll Call | Editor & Podcast Producer | F | 2713 | 198 | 6325 | Tue May 26 07:41:38 +0000 2009 | False | False | 32.00 | 25.00 | 0.00 | 25.00 | 82.00 | Bottom 90% |
# Write the per-user summary table to output/user_summary.csv.
user_summary_df.to_csv('output/user_summary.csv')
This summarizes the users that are members of each organization.
# For each organization compute the sum, the group size and the average of
# every numeric column; the result has (column, statistic) column pairs.
# NOTE(review): newer pandas releases may warn when NumPy callables are passed
# to agg. The statistic labels 'sum', 'size' and 'average' are used as column
# keys further down, so keep them if this call is ever changed.
org_summary_df = (
    user_summary_df[
        ['organization', 'followers_count', 'following_count', 'tweet_count', 'tweets_in_dataset']
    ]
    .groupby('organization')
    .agg([np.sum, np.size, np.average])
)
# Non-null count per (column, statistic) pair.
org_summary_df.count()
followers_count sum 347 size 347 average 347 following_count sum 347 size 347 average 347 tweet_count sum 347 size 347 average 347 tweets_in_dataset sum 347 size 347 average 347 dtype: int64
org_summary_df.head()
followers_count | following_count | tweet_count | tweets_in_dataset | |||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
sum | size | average | sum | size | average | sum | size | average | sum | size | average | |
organization | ||||||||||||
57347 | 29 | 1,977.48 | 30788 | 29 | 1,061.66 | 151441 | 29 | 5,222.10 | 2,767.00 | 29.00 | 95.41 | |
ABC 7 | 889 | 1 | 889.00 | 1092 | 1 | 1,092.00 | 1946 | 1 | 1,946.00 | 464.00 | 1.00 | 464.00 |
ABC News | 602790 | 52 | 11,592.12 | 72154 | 52 | 1,387.58 | 372200 | 52 | 7,157.69 | 8,629.00 | 52.00 | 165.94 |
AP–Broadcast | 5305 | 15 | 353.67 | 7974 | 15 | 531.60 | 16794 | 15 | 1,119.60 | 527.00 | 15.00 | 35.13 |
Afro American Newspapers | 189 | 1 | 189.00 | 202 | 1 | 202.00 | 596 | 1 | 596.00 | 14.00 | 1.00 | 14.00 |
# Write the per-organization summary table to output/organization_summary.csv.
org_summary_df.to_csv('output/organization_summary.csv')
# List every organization name exactly as it appears in the index.
org_summary_df.index.tolist()
['', 'ABC 7', 'ABC News', 'AP–Broadcast', 'Afro American Newspapers', 'Agence France Presse (AFP–TV)', 'Agence France-Presse', 'Agri-Pulse', 'Air Force Magazine', 'Alaska Dispatch News', 'Alaska Public Radio Network', 'Albuquerque Journal', 'Aljazeera America', 'Aljazeera English', 'Allentown Morning Call', 'American Banker', 'American Gaming Association', 'American Prospect', 'Argus Media', 'Army Times', 'Associated Press', 'Atlanta Journal-Consitution', 'Austin American-Statesman', 'Axios', 'BBC', 'Baltimore Sun', 'Bankrate', 'Bloomberg BNA', 'Bloomberg Government', 'Bloomberg News', 'Bloomberg TV', 'Bond Buyer', 'Boston Globe', 'Boston Herald', 'Breitbart News', 'Broadcasting & Cable', 'Buffalo News', 'BuzzFeed', 'Buzzfeed', 'CBN News', 'CBS News', 'CDC Gaming Reports', 'CEO Update', 'CNBC', 'CNN', 'CNN International', 'CNSNews.com', 'CQ Researcher', 'CQ Roll Call', 'CRTV', 'CTV–Community TV of PG County', 'Canadian Press', 'Carroll County Times', 'Center for Public Integrity', 'Charleston Post and Courier', 'Chicago Sun-Times', 'Chicago Tribune', 'Christian Science Monitor', 'Chronicle of Higher Education', 'Chronicle of Philanthropy', 'Circa', 'CityLab', 'Cleveland Plain Dealer', 'Colorado Public Radio', 'Columbus Dispatch', 'Communications Daily', 'Consumer Reports', 'Cook Political Report', 'Corporate Crime Reporter', 'Cosmopolitan', 'Court House News', 'Cox Broadcasting', 'Crain Communications', 'Cronkite News Service', 'Crux: Catholic News Agency', 'C–SPAN', 'DC Spotlight Newspaper', 'DCist', 'Daily Beast', 'Daily Caller', 'Daily Deal', 'Daily Mail', 'Daily Mail (UK)', 'Dallas Morning News', 'Defense Daily', 'Defense News', 'Defense One', 'Denver Post', 'Detroit News', 'Diverse: Issues in Higher Education', 'E! 
Networks', 'E&E News', 'EWTN', 'Eater', 'Economist', 'Education Week', 'Energy Daily', 'Energy Intelligence', 'Environment & Energy Publishing, LLC', "FERN's Ag Insider", 'FTC Watch', 'Fairchild Publications', 'Falls Church News Press', 'Famous DC', 'Feature Story News', 'FedNet', 'Federal Computer Week', 'Federal News Radio 1500 AM', 'Financial Times', 'Financial Times ', 'Fiscal Times', 'FiveThirtyEight', 'Foreign Policy', 'Fortune Magazine', 'Fox Business Network', 'Fox News', 'Fox News Radio', 'France24', 'Freelance', 'Freelance ', 'Frontline Medical Communications', 'Fusion', 'Gannett Government Media Corp', 'Gannett Washington Bureau', 'Glamour Magazine', 'Global Competition Review', 'Globe and Mail', 'Governing', 'Government Executive', 'Gray Television', 'Guardian US', 'Haddad Media', 'Hearst Newspapers', 'Hearst Television Inc.', 'Hispanic Outlook', 'Honolulu Civil Beat', 'Houston Chronicle', 'Huffington Post', 'IDG Communications', 'IDG News Service', 'Independent Journal Review', 'Independent Television News (ITN)', 'Industry Dive', 'Informavore Media, LLC', 'Inside Elections', 'InsideClimate News', 'InsidePolitics', 'Internews Network', 'Investor’s Business Daily', 'Irish Times', 'Jewish Journal', 'Jewish Telegraphic Agency', 'Journal Media Group', 'KATU News', 'KCETLink', 'KFI', 'KNTV', 'KTWO TV', 'Kaiser Health News', 'Kansas City Star', 'LRP Publications', 'Laslo Congressional Bureau', 'Lilly Broadcasting', 'LocalNews Now', 'Los Angeles Times', 'MLEX US', 'MRCTV', 'MSNBC', 'MTV News', 'Manifest', 'MapLight', 'Market News International', 'MarketWatch', 'Marketplace Radio', 'McClatchy', 'McClatchy Newspapers', 'MedPage Today', 'MedTech Insight', 'Media General', 'Merger Market of Financial Times', 'Metro Weekly', 'Mic', 'Military.com', 'MinnPost', 'Minneapolis Star Tribune', 'Montgomery County Sentinel', 'Morning Consult', 'Morning Edition', 'Mother Jones', 'NBC', 'NBC News', 'NBC Newschannel', 'NJ Advance Media', 'Nation', 'National Catholic 
Reporter', 'National Geographic Magazine', 'National Journal', 'National Law Journal', 'National Mortgage News', 'National Public Radio', 'National Review', 'Nature', 'NerdWallet', 'New Republic', 'New York ', 'New York Daily News', 'New York Post', 'New York Times', 'New York Times Magazine', 'New Yorker', 'NewsMax', 'Newsday', 'Newsweek', 'Nexstar Media Group', 'Omaha World-Herald', 'Ozy', 'PBS', 'PBS NewsHour', 'People Magazine', 'Pew Charitable Trusts', 'Philadelphia Inquirer', 'Pittsburgh Post-Gazette', 'Politico', 'Power Markets Today', 'Praetorian Digital', 'ProPublica', 'RTTV America', 'Radio Free Asia', 'Radio One', 'Real Clear Politics', 'Real News Network', 'RealClearPolitics', 'Record (Bergen County, NJ)', 'Religion & Ethics Newsweekly', 'Religion News Service', 'Religious News Service', 'Reuters Radio & TV', 'Rural TV News', 'S&P Global Market Intelligence', 'S&P Global Platts', 'SAGE Business Researcher', 'SB Nation', 'SRN News (Salem)', 'Salt Lake Tribune', 'San Francisco Chronicle', 'Scientific American', 'Scoop News', 'Scripps Howard News Service', 'Scripps News', 'Scudder Publishing', 'Senate Democrats', 'Sightline Media Group', 'Sinclair Broadcast Group', 'Sirius XM', 'Sirius XM Satellite Radio', 'Sky News', 'Slate', 'Smithsonian Magazine', 'Snapchat', 'Space News', 'St. Louis Post-Dispatch', 'St. 
Louis Public Radio', 'Standard - Examiner', 'Stars and Stripes', 'Stat News', 'Stateline.org', 'Stephens Media Group', 'SurveyMonkey', 'Syracuse Post-Standard', 'TEGNA', 'Talk Radio News Service', 'Talking Points Memo', 'Tampa Bay Times', 'Telemundo Network', 'Texas Tribune', 'The 74 Media', 'The Atlantic', 'The Cipher Brief', 'The Hill', 'The Hotline', 'The New York Times On The Web', 'The Root', 'The Voyage Report', 'The atlantic', 'TheStreet', 'ThinkProgress', 'This Is America with Dennis Wholey', 'Thom Hartmann Program', 'Thomsen Reuters', 'Thomson Reuters', 'Time Magazine', 'Time Warner Cable', 'Times of London', 'To The Contrary (Persephone Productions)', 'Toronto Star', 'TownHall', 'Townhall', 'Transport Topics', 'Trinity Broadcast Network', 'U.S. News & World Report', 'UCG', 'USA Today', 'Univision', 'Vanity Fair', 'Variety', 'Vice News', 'Voice of America', 'Voterama in Congress', 'Vox ', 'Vox Media', 'WBAL-TV', 'WBALL TV 11', 'WFDC–TV Univision', 'WJLA–TV / Newschannel 8', 'WMAL Radio', 'WMDT', 'WNEW / CBS DC', 'WNYC', 'WPFW–FM', 'WRC–TV / NBC–4', 'WTOP', 'WTOP Radio', 'WTTG-TV', 'WTTG–Fox Television', 'WUSA–TV', 'Wall Street Journal', 'Wall Street Journal / Dow Jones', 'Washington Blade', 'Washington Bureau News Service', 'Washington Business Journal', 'Washington City Paper', 'Washington Examiner', 'Washington Free Beacon', 'Washington Post', 'Washington Radio & Press Service', 'Washington Times', 'Washingtonian', 'Washingtonpost.com', 'Weekly Standard', 'West Wing Writers', 'Westwood One', 'White House Dossier', 'Wired', 'Wisconsin NewsHour', 'World Magazine', 'Yahoo News', 'ZDNet']
These are summary statistics for the tweets in the dataset.
# Number of tweets of each type across the whole dataset.
tweet_df['tweet_type'].value_counts()
retweet 273412 original 199949 reply 93184 quote 83805 Name: tweet_type, dtype: int64
# Distribution of the per-user count of each tweet type.
user_summary_df[['original', 'quote', 'reply', 'retweet']].describe()
original | quote | reply | retweet | |
---|---|---|---|---|
count | 2,484.00 | 2,484.00 | 2,484.00 | 2,484.00 |
mean | 79.83 | 33.54 | 37.22 | 109.56 |
std | 135.84 | 90.07 | 186.34 | 341.02 |
min | 0.00 | 0.00 | 0.00 | 0.00 |
25% | 5.00 | 0.00 | 0.00 | 3.00 |
50% | 29.00 | 5.00 | 3.00 | 24.00 |
75% | 99.00 | 28.00 | 18.00 | 94.25 |
max | 1,579.00 | 1,440.00 | 7,328.00 | 8,855.00 |
For the top 1%, middle 9%, and bottom 90% of tweeters, this shows the number and types of tweets they account for.
# Users whose total tweet count was binned as 'Top 1%'.
user_summary_df[user_summary_df.tweets_in_dataset_bin == 'Top 1%']
screen_name | name | organization | position | gender | followers_count | following_count | tweet_count | user_created_at | verified | protected | original | quote | reply | retweet | tweets_in_dataset | tweets_in_dataset_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||
456994513 | maria_e_recio | Recio, Maria | Austin American-Statesman | Political Reporter | F | 1039 | 530 | 38464 | Fri Jan 06 22:22:40 +0000 2012 | False | False | 261.00 | 291.00 | 108.00 | 3,204.00 | 3,864.00 | Top 1% |
22891564 | chrisgeidner | Geidner, Chris | BuzzFeed | Legal Editor & Supreme Court Correspondent | M | 78631 | 4767 | 201131 | Thu Mar 05 06:48:00 +0000 2009 | True | False | 592.00 | 475.00 | 2,850.00 | 750.00 | 4,667.00 | Top 1% |
21810329 | sdonnan | Donnan, Shawn | Financial Times | Wolrd Trade Editor | M | 11693 | 5428 | 75733 | Tue Feb 24 23:10:17 +0000 2009 | True | False | 203.00 | 374.00 | 152.00 | 2,792.00 | 3,521.00 | Top 1% |
19545932 | kampeas | Kampeas, Ron | Jewish Telegraphic Agency | Washington Bureau Chief | M | 6901 | 1952 | 50954 | Mon Jan 26 17:37:58 +0000 2009 | False | False | 506.00 | 349.00 | 202.00 | 2,027.00 | 3,084.00 | Top 1% |
47408060 | jonathanlanday | Landay, Jonathan | McClatchy Newspapers | National Security Correspondent | M | 11126 | 1093 | 78318 | Mon Jun 15 18:42:47 +0000 2009 | True | False | 418.00 | 41.00 | 70.00 | 2,352.00 | 2,881.00 | Top 1% |
3817401 | ericgeller | Geller, Eric | Politico | Cybersecurity Reporter | M | 52569 | 732 | 201279 | Sun Apr 08 20:27:11 +0000 2007 | True | False | 820.00 | 1,435.00 | 7,328.00 | 0.00 | 9,583.00 | Top 1% |
593813785 | donnayoungdc | Young, Donna | S&P Global Market Intelligence | Senior Reporter | F | 5654 | 1621 | 46571 | Tue May 29 15:45:45 +0000 2012 | False | False | 1,095.00 | 885.00 | 9.00 | 1,169.00 | 3,158.00 | Top 1% |
104299137 | davidmdrucker | Drucker, David | Washington Examiner | Senior Political Correspondent | M | 32966 | 2475 | 101229 | Tue Jan 12 22:56:50 +0000 2010 | True | False | 611.00 | 1,122.00 | 517.00 | 934.00 | 3,184.00 | Top 1% |
61734492 | fahrenthold | Fahrenthold, David | Washington Post | Political Reporter | M | 419647 | 3341 | 25457 | Fri Jul 31 09:29:37 +0000 2009 | True | False | 115.00 | 142.00 | 63.00 | 2,333.00 | 2,653.00 | Top 1% |
13524182 | daveweigel | Weigel, David | Washington Post | Political Reporter | M | 318915 | 10169 | 166821 | Fri Feb 15 17:58:23 +0000 2008 | True | False | 712.00 | 784.00 | 242.00 | 2,155.00 | 3,893.00 | Top 1% |
25702314 | ericmgarcia | Garcia, Eric M. | CQ Roll Call | Reporter | M | 2960 | 3748 | 42198 | Sat Mar 21 17:44:40 +0000 2009 | False | False | 441.00 | 1,188.00 | 575.00 | 405.00 | 2,609.00 | Top 1% |
18825339 | cahnemily | Cahn, Emily | Mic | Senior Politics Writer | F | 16181 | 2118 | 95033 | Sat Jan 10 03:19:50 +0000 2009 | True | False | 1,205.00 | 1,440.00 | 279.00 | 3,459.00 | 6,383.00 | Top 1% |
21612122 | hotlinejosh | Kraushaar, Josh P. | National Journal | Politics Editor | M | 49151 | 1456 | 152116 | Sun Feb 22 23:45:46 +0000 2009 | True | False | 395.00 | 643.00 | 338.00 | 4,302.00 | 5,678.00 | Top 1% |
21696279 | brianbeutler | Beutler, Brian Alfred | New Republic | Senior Editor | M | 71586 | 722 | 96050 | Mon Feb 23 21:31:16 +0000 2009 | True | False | 475.00 | 546.00 | 714.00 | 2,122.00 | 3,857.00 | Top 1% |
16459325 | ryanbeckwith | Beckwith, Ryan Teague | Time Magazine | Politics Editor | M | 20241 | 6826 | 88797 | Thu Sep 25 22:43:36 +0000 2008 | True | False | 843.00 | 529.00 | 753.00 | 1,778.00 | 3,903.00 | Top 1% |
42352386 | rschles | Schlesinger, Robert | U.S. News & World Report | Managing Editor, Opinion | M | 4426 | 1910 | 34044 | Mon May 25 04:52:44 +0000 2009 | True | False | 122.00 | 590.00 | 56.00 | 2,206.00 | 2,974.00 | Top 1% |
304988603 | neilwmccabe | McCabe, Neil | Breitbart News | Political Correspondent | M | 18991 | 7699 | 57983 | Wed May 25 13:09:32 +0000 2011 | False | False | 682.00 | 616.00 | 227.00 | 4,444.00 | 5,969.00 | Top 1% |
191964162 | samlitzinger | Litzinger, Sam | CBS News | Correspondent, CBS Radio | M | 2302 | 2164 | 90023 | Fri Sep 17 20:37:31 +0000 2010 | False | False | 759.00 | 206.00 | 430.00 | 5,331.00 | 6,726.00 | Top 1% |
259395895 | johnjharwood | Harwood, John | CNBC | Chief Washington Correspondent | M | 139370 | 1157 | 73724 | Tue Mar 01 20:49:40 +0000 2011 | True | False | 825.00 | 487.00 | 83.00 | 3,307.00 | 4,702.00 | Top 1% |
14529929 | jaketapper | Tapper, Jake | CNN | Anchor & Chief Washington Correspondent | M | 1238317 | 5664 | 144300 | Fri Apr 25 17:23:28 +0000 2008 | True | False | 1,162.00 | 266.00 | 645.00 | 1,295.00 | 3,368.00 | Top 1% |
15486163 | simonmarksfsn | Marks, Simon | Feature Story News | President & Chief Correspondent | M | 7622 | 3632 | 39421 | Fri Jul 18 20:45:38 +0000 2008 | False | False | 1,191.00 | 613.00 | 189.00 | 1,017.00 | 3,010.00 | Top 1% |
19576571 | jaredrizzi | Rizzi, Jared | Sirius XM Satellite Radio | White House Correspondent, SXMPOTUS | M | 12277 | 5924 | 38049 | Tue Jan 27 04:09:53 +0000 2009 | True | False | 645.00 | 858.00 | 1,393.00 | 2,050.00 | 4,946.00 | Top 1% |
2453025128 | gloriaminott | Minott, Gloria | WPFW–FM | Journalist and Radio Host | F | 468 | 232 | 45438 | Sat Apr 19 12:03:52 +0000 2014 | False | False | 0.00 | 0.00 | 1.00 | 8,855.00 | 8,856.00 | Top 1% |
# Total tweets of each type contributed by the users in each bin.
tweets_in_dataset_bin_summary_df = (
    user_summary_df[
        ['original', 'quote', 'reply', 'retweet', 'tweets_in_dataset', 'tweets_in_dataset_bin']
    ]
    .groupby('tweets_in_dataset_bin')
    .sum()
)
# Each bin's share of the column total, as a fraction between 0 and 1.
tweets_in_dataset_bin_summary_df = tweets_in_dataset_bin_summary_df.assign(**{
    share_col: (
        tweets_in_dataset_bin_summary_df[count_col]
        / tweets_in_dataset_bin_summary_df[count_col].sum()
    )
    for share_col, count_col in [
        ('percent_of_original', 'original'),
        ('percent_of_quote', 'quote'),
        ('percent_of_reply', 'reply'),
        ('percent_of_retweets', 'retweet'),
        ('percent_of_tweets_in_dataset', 'tweets_in_dataset'),
    ]
})
# Number of users that fall in each bin.
tweets_in_dataset_bin_summary_df['users_in_bin'] = (
    user_summary_df.groupby('tweets_in_dataset_bin')['tweets_in_dataset'].count()
)
tweets_in_dataset_bin_summary_df
original | quote | reply | retweet | tweets_in_dataset | percent_of_original | percent_of_quote | percent_of_reply | percent_of_retweets | percent_of_tweets_in_dataset | users_in_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|
tweets_in_dataset_bin | |||||||||||
Bottom 90% | 118,274.00 | 36,419.00 | 31,546.00 | 116,400.00 | 302,639.00 | 0.60 | 0.44 | 0.34 | 0.43 | 0.47 | 2043 |
Middle 9% | 65,947.00 | 33,018.00 | 43,692.00 | 97,456.00 | 240,113.00 | 0.33 | 0.40 | 0.47 | 0.36 | 0.37 | 206 |
Top 1% | 14,078.00 | 13,880.00 | 17,224.00 | 58,287.00 | 103,469.00 | 0.07 | 0.17 | 0.19 | 0.21 | 0.16 | 23 |
# Distribution of the account-level counts that came from the user lookup file.
user_summary_df[['followers_count', 'following_count', 'tweet_count']].describe()
followers_count | following_count | tweet_count | |
---|---|---|---|
count | 2,484.00 | 2,484.00 | 2,484.00 |
mean | 14,644.39 | 1,344.52 | 8,760.62 |
std | 84,477.36 | 2,805.21 | 15,836.17 |
min | 0.00 | 0.00 | 0.00 |
25% | 659.00 | 428.00 | 1,001.25 |
50% | 2,114.00 | 933.00 | 3,578.00 |
75% | 6,611.00 | 1,621.50 | 9,572.00 |
max | 2,133,806.00 | 94,689.00 | 201,279.00 |
# Number of users per coded gender value.
user_summary_df['gender'].value_counts()
M 1398 F 1085 Name: gender, dtype: int64
# Organizations ranked by average follower count per member, highest first.
org_summary_df[['followers_count']].sort_values([('followers_count', 'average')], ascending=False).head()
followers_count | |||
---|---|---|---|
sum | size | average | |
organization | |||
MSNBC | 1732992 | 7 | 247,570.29 |
Toronto Star | 165056 | 1 | 165,056.00 |
New York | 125754 | 1 | 125,754.00 |
New Yorker | 125180 | 1 | 125,180.00 |
MTV News | 101473 | 1 | 101,473.00 |
# Organizations ranked by average following count per member, highest first.
org_summary_df[['following_count']].sort_values([('following_count', 'average')], ascending=False).head()
following_count | |||
---|---|---|---|
sum | size | average | |
organization | |||
White House Dossier | 7441 | 1 | 7,441.00 |
Snapchat | 6019 | 1 | 6,019.00 |
Bankrate | 5853 | 1 | 5,853.00 |
New York Daily News | 4288 | 1 | 4,288.00 |
Texas Tribune | 3935 | 1 | 3,935.00 |
# Organizations ranked by average account tweet count per member, highest first.
org_summary_df[['tweet_count']].sort_values([('tweet_count', 'average')], ascending=False).head()
tweet_count | |||
---|---|---|---|
sum | size | average | |
organization | |||
New Republic | 96050 | 1 | 96,050.00 |
Mic | 95033 | 1 | 95,033.00 |
Yahoo News | 93714 | 1 | 93,714.00 |
MTV News | 80962 | 1 | 80,962.00 |
ProPublica | 78207 | 1 | 78,207.00 |
# Organizations ranked by total tweets in the dataset (sum, not average), highest first.
org_summary_df[['tweets_in_dataset']].sort_values([('tweets_in_dataset', 'sum')], ascending=False).head()
tweets_in_dataset | |||
---|---|---|---|
sum | size | average | |
organization | |||
Politico | 43,669.00 | 103.00 | 423.97 |
CNN | 33,868.00 | 149.00 | 227.30 |
Washington Post | 22,621.00 | 60.00 | 377.02 |
Bloomberg News | 17,558.00 | 75.00 | 234.11 |
CBS News | 17,036.00 | 61.00 | 279.28 |
# Get the first tweet for each user: idxmin gives, per user, the row label of
# the earliest tweet_created_at, and those rows are then re-indexed by user_id.
first_tweet_df = (
    tweet_df
    .loc[tweet_df.groupby('user_id')['tweet_created_at'].idxmin()]
    .set_index('user_id')
)
# Non-null count per column (one row per user that has tweets).
first_tweet_df.count()
tweet_id 2293 screen_name 2293 tweet_created_at 2293 user_created_at 2293 tweets_to_date 2293 tweet_type 2293 dtype: int64
# Users whose earliest tweet in the dataset is the most recent.
first_tweet_df.sort_values('tweet_created_at', ascending=False).head()
tweet_id | screen_name | tweet_created_at | user_created_at | tweets_to_date | tweet_type | |
---|---|---|---|---|---|---|
user_id | ||||||
16338087 | 876092563563958272 | AbbyDanzig | 2017-06-17 15:01:58+00:00 | 2008-09-17 22:10:27+00:00 | 1542 | retweet |
3901972468 | 875730040750604288 | jchamseddine10 | 2017-06-16 15:01:26+00:00 | 2015-10-08 18:44:17+00:00 | 605 | original |
198935531 | 875477217895231488 | CarrieStevenson | 2017-06-15 22:16:48+00:00 | 2010-10-05 16:30:31+00:00 | 438 | original |
267210696 | 875005803283050496 | PeteBehrEENews | 2017-06-14 15:03:34+00:00 | 2011-03-16 14:28:09+00:00 | 24 | original |
425112739 | 874967586085244930 | jzieglerWTOP | 2017-06-14 12:31:43+00:00 | 2011-11-30 15:37:28+00:00 | 815 | retweet |
# Latest "first tweet" timestamp across all users.
first_tweet_df['tweet_created_at'].max()
Timestamp('2017-06-17 15:01:58+0000', tz='UTC')