#!/usr/bin/env python
# coding: utf-8
# # USNPL
# A scraper for the usnpl site for newspapers, magazines, and college papers.
#
# View this on [Github](https://github.com/yinleon/usnpl/blob/master/scrape_usnpl.ipynb).
# View this on [NBViewer](https://nbviewer.jupyter.org/github/yinleon/usnpl/blob/master/scrape_usnpl.ipynb?flush=true).
# Visit my lab's [website](https://wp.nyu.edu/smapp/)
#
# ### The output has the following columns
# - Geography - The state of the media source.
# - Name - The name of the media source.
# - Facebook - The URL for the Facebook account.
# - Twitter_Name - The screen name for the Twitter account.
# - Twitter_ID - The user ID for the Twitter account.
# - Website - The URL for the media source's website.
# - Medium - The format of the media source (newspaper, magazine, or college newspaper).
#
# The output is publicly available as a csv here.
# ... and you can access it from the web using pandas:
# In[46]:
import pandas as pd
link = 'https://raw.githubusercontent.com/yinleon/usnpl/master/data/usnpl_newspapers_twitter_ids.csv'
df = pd.read_csv(link, dtype={'Twitter_ID' : str})
df.head()
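# For example, assuming the `Geography` column holds the same two-letter state codes used for scraping below (an assumption about the published csv), you can pull the Twitter handles and IDs for a single state:
# In[ ]:
# illustrative query against the published csv; 'ny' is just an example state code
ny_papers = df[df['Geography'] == 'ny'][['Name', 'Twitter_Name', 'Twitter_ID']]
ny_papers.head()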
# The Python modules required to run this notebook are listed in the requirements.txt file, and can be installed from within the notebook with the command below:
# In[41]:
get_ipython().system('pip install -r requirements.txt')
# With the requirements installed, you can use the code below to scrape the website.
# In[1]:
import os
import time
import requests
import tweepy
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from bs4 import BeautifulSoup
# In[2]:
states = '''ak al ar az ca co ct dc de fl ga hi ia id il in ks ky la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh ok or pa ri sc sd tn tx ut va vt wa wi wv wy'''
states = states.split()
# In[3]:
def parse_row(soup):
    '''
    For each media publication in the html,
    we're going to extract the city name, the publication name,
    the website url, and the social links (if they exist).

    The input `soup` is a BeautifulSoup object for one entry.
    The output is a dict of the parsed fields.
    '''
    # the city is in a bold tag; the first anchor holds the publication name and website
    city = soup.find('b').text
    name = soup.find('a').text
    web = soup.find('a').get('href')

    # the Facebook and Twitter links are anchors labeled 'F' and 'T'
    fb = soup.find('a', text='F')
    if fb:
        fb = fb.get('href')

    tw = soup.find('a', text='T')
    if tw:
        tw = tw.get('href').replace('http://www.twitter.com/', '').rstrip('/')

    return {
        'Facebook': fb,
        'Twitter_Name': tw,
        'Name': name,
        'Website': web
    }
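# As a quick sanity check, here is what `parse_row` returns for a hand-written snippet that roughly mimics one entry on the usnpl pages. The html below is only an illustrative approximation; the live markup may differ.
# In[ ]:
example_html = '''<b>Brooklyn</b>
<a href="http://www.example-paper.com">Example Paper</a>
<a href="https://www.facebook.com/examplepaper">F</a>
<a href="http://www.twitter.com/examplepaper/">T</a>'''
example_soup = BeautifulSoup(example_html, 'lxml')
parse_row(example_soup)
# -> {'Facebook': 'https://www.facebook.com/examplepaper',
#     'Twitter_Name': 'examplepaper',
#     'Name': 'Example Paper',
#     'Website': 'http://www.example-paper.com'}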
# We will apply that function to the sections of the website where the data lives. The output for each entry will be appended to the list `sites`, building up a list of dictionaries.
#
# Line by line, we loop through each 2-letter state abbreviation, fetch the html of that state's page, soupify it (so that we can parse it), parse out the fields we're interested in, and then append those results to the list. The if/elif branches on `medium` differ because the html holding each kind of listing also differs.
# In[45]:
sites = []
for state in states:
url = 'http://www.usnpl.com/{}news.php'.format(state)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
    # the listings live in 'data_box' divs; the first one is skipped
    data_possibilities = soup.find_all('div', {'id': 'data_box'})
    for i, raw_table in enumerate(data_possibilities[1:]):
        # the first table has an extra leading block, so offset the split index
        j = 1 if i == 0 else 0
        medium = raw_table.find('h3').text
        if medium == 'Newspapers':
            # split the raw html into the block of entries, then into individual entries
            data_table = str(raw_table).split('<br/>\n\n')[j]
            entries_to_parse = data_table.rstrip('</div>').split('\n<br/>\n')
elif medium in ['Magazines', 'College Newspapers']:
data_table = str(raw_table).split('