#!/usr/bin/env python # coding: utf-8 # In[104]: runtime_meta() # # USNPL # A scraper for the usnpl site for newspapers, magazines, and college papers. # # View this on [Github](https://github.com/yinleon/usnpl/blob/master/scrape_usnpl.ipynb).
# View this on [NBViewer](https://nbviewer.jupyter.org/github/yinleon/usnpl/blob/master/scrape_usnpl.ipynb?flush=true).
# Visit my lab's [website](https://wp.nyu.edu/smapp/) # # ### The output has # Geography - The state of the media source.
# Name - The name of the media source.
# Facebook - The URL for the Facebook account.
# Twitter_Name - The Tweet screen name for the Twitter account.
# Twitter_ID - The Tweet ID for the Twitter Account.
# Website - The URL for the media source
# Medium - What format is the media source? # # The output is publicly available as a csv here.
# The output CSV is published on GitHub, so you can load it straight from
# the web with pandas:
import pandas as pd

link = 'https://raw.githubusercontent.com/yinleon/usnpl/master/data/usnpl_newspapers_twitter_ids.csv'
df = pd.read_csv(link, dtype={'Twitter_ID': str})
df.head()

# The Python modules required to run this are listed in requirements.txt
# and can be installed from within the notebook:
get_ipython().system('pip install -r requirements.txt')

# With the requirements installed, the code below scrapes the website.
import os
import re
import time

import requests
import tweepy
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

# Two-letter abbreviations for every US state plus DC.
# str.split() with no argument splits on any run of whitespace (spaces
# and newlines alike), so no per-token strip() is needed.
states = '''ak al ar az ca co ct dc de fl ga hi ia id il in ks ky la ma
md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh ok or pa ri sc sd tn tx
ut va vt wa wi wv wy'''
states = states.split()

# Matches the scheme + host prefix of a Twitter profile URL so that only
# the bare screen name remains. The original code replaced the literal
# 'http://www.twitter.com/' only, which silently left https:// and
# non-www variants untouched.
_TWITTER_URL_PREFIX = re.compile(r'^https?://(?:www\.)?twitter\.com/', re.IGNORECASE)


def parse_row(soup):
    '''Parse one media-publication entry out of a fragment of usnpl.com HTML.

    Each entry carries the publication name, its website URL, and — when
    present — links labeled 'F' (Facebook) and 'T' (Twitter).

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        A soupified single-entry HTML fragment.

    Returns
    -------
    dict
        Keys: 'Facebook' (URL or None), 'Twitter_Name' (bare screen name
        or None), 'Name', 'Website'.
    '''
    name = soup.find('a').text
    web = soup.find('a').get('href')

    fb = soup.find('a', text='F')
    if fb:
        fb = fb.get('href')

    tw = soup.find('a', text='T')
    if tw:
        # Reduce the profile URL to the screen name, tolerating both
        # http/https and www/non-www forms.
        tw = _TWITTER_URL_PREFIX.sub('', tw.get('href')).rstrip('/')

    return {
        'Facebook': fb,
        'Twitter_Name': tw,
        'Name': name,
        'Website': web,
    }

# We apply parse_row() to each data section of the site; results are
# appended to the list `sites`, producing a list of dicts.
# The if/else branches on `medium` differ because the HTML that holds the
# entries differs between the newspaper table and the magazine / college
# tables.
#
# NOTE(review): the two delimiter literals below were garbled in this copy
# of the file (raw newlines were spliced into the original string
# literals). They are reconstructed best-effort — confirm against the
# original notebook / the live page HTML before trusting the output.
NEWSPAPER_TABLE_SPLIT = '\n\n'  # TODO confirm against original notebook
ENTRY_SPLIT = '\n\n'            # TODO confirm against original notebook

sites = []
for state in states:
    url = 'http://www.usnpl.com/{}news.php'.format(state)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')

    data_possibilities = soup.find_all('div', {'id': 'data_box'})
    # Skip the first data_box; the tables of interest follow it.
    for i, raw_table in enumerate(data_possibilities[1:]):
        # The newspaper table keeps its rows in the second chunk, later
        # tables in the first.
        j = 1 if i == 0 else 0
        medium = raw_table.find('h3').text
        if medium == 'Newspapers':
            data_table = str(raw_table).split(NEWSPAPER_TABLE_SPLIT)[j]
            # The original called data_table.rstrip('') here — an empty
            # strip-set strips nothing, so the no-op call is dropped.
            entries_to_parse = data_table.split(ENTRY_SPLIT)
        elif medium in ['Magazines', 'College Newspapers']:
            data_table = str(raw_table).split('Untitled Document')[1]
            entries_to_parse = data_table.split(ENTRY_SPLIT)
        else:
            break

        for row in tqdm(entries_to_parse):
            row = row.strip('\r').strip('\n')
            if row:
                entry = parse_row(BeautifulSoup(row, 'lxml'))
                entry['Geography'] = state.upper()
                entry['Medium'] = medium
                sites.append(entry)
    # Be polite to the server: one request per second.
    time.sleep(1)

# That list of dicts drops straight into a DataFrame.
df = pd.DataFrame(sites)
df['Website'] = df['Website'].str.rstrip('/')
df.head()

# Save the scraped outlets.
df.to_csv('data/usnpl_newspapers.csv', index=False)

# ### Getting Twitter user IDs
# We want the numeric Twitter ID in addition to the screen name; this
# section uses Tweepy for the lookups.

# Fill these in with your own app credentials!
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

# Unique, non-null screen names to query.
twitter_names = df[df['Twitter_Name'].notnull()]['Twitter_Name'].unique()

# Example API call showing the user metadata we get back. Uses the
# screen_name keyword for consistency with the lookup loop below (the
# original passed it positionally here only).
user = api.get_user(screen_name=twitter_names[0])
user._json

# I wonder what we can win in the raffle with our hunting license?
# You don't need a hunting license to scrape the web and do some wacky
# analysis!
#
# Iterate through the unique Twitter usernames from our media outlets,
# make the API call, and store the results as a list of dicts. Lookups
# that fail (suspended/deleted/protected accounts) record a null ID
# instead of aborting the run.
from tweepy import TweepError

user_ids = []
for screen_name in tqdm(twitter_names):
    try:
        user_id = api.get_user(screen_name=screen_name).id_str
    except TweepError:
        # Account no longer resolvable — keep the row with a null ID.
        # (The original had a redundant `pass` after this assignment.)
        user_id = None
    user_ids.append({
        'Twitter_ID': user_id,
        'Twitter_Name': screen_name,
    })

# Save the lookups, then merge them back onto the scraped USNPL fields.
df_users = pd.DataFrame(user_ids)
df_users.to_csv('data/twitter_users.csv', index=False)

output_columns = ['Name', 'Medium', 'Website', 'Facebook',
                  'Twitter_Name', 'Twitter_ID', 'Geography']
df_merge = df.merge(df_users, on='Twitter_Name', how='left')[output_columns]
df_merge.to_csv('data/usnpl_newspapers_twitter_ids.csv', index=False)

# And that is how we scrape a website and get a nice csv.
df_merge[df_merge['Name'] == 'Fairbanks Daily News-Miner']

# Aggregate to get an idea of the media landscape across states.
get_ipython().run_line_magic('matplotlib', 'inline')

df_ = df_merge[df_merge['Geography'].isin(['WY', 'AZ', 'MA', 'NY'])]
df_ = pd.crosstab(df_['Geography'], df_['Medium'])
df_.plot(kind='barh', title="Num of State-Level Media Outlets");