#!/usr/bin/env python
# coding: utf-8

# # Pinboard
#
# Pinboard is a social bookmarking site where people share links to content
# and *tag* them by assigning a word that describes the content. These tags
# are free-form, and each user decides which ones to use.
#
# Pinboard has a nice [API](https://pinboard.in/api/) for interacting with
# your own bookmarks, but not for getting all public bookmarks for a tag.
# Pinboard also makes all tag pages available as RSS, e.g.
# https://feeds.pinboard.in/rss/t:covid-19 but it unfortunately doesn't allow
# paging back in time.
#
# So unfortunately we're going to have to scrape the pages. But fortunately
# this won't be too difficult with the
# [requests_html](https://requests-html.kennethreitz.org/) module because
# Pinboard has done such a nice job of using
# [semantic html](https://en.wikipedia.org/wiki/Semantic_HTML).

# In[1]:

import time

import dateutil.parser
import requests_html


def pinboard(hashtag):
    """Yield public Pinboard bookmarks tagged *hashtag*, newest first.

    Scrapes https://pinboard.in/t:<hashtag> and follows the "earlier"
    pagination link until it runs out, sleeping one second between page
    fetches to be polite to the server.

    Yields:
        dict with keys 'url' (str), 'title' (str) and 'created'
        (a datetime parsed from the bookmark's timestamp).

    Raises:
        requests.HTTPError: if a page request returns an error status.
    """
    http = requests_html.HTMLSession()
    pinboard_url = 'https://pinboard.in/t:{}'.format(hashtag)
    while True:
        resp = http.get(pinboard_url)
        # Fail loudly on an HTTP error instead of silently parsing an
        # error page as "no bookmarks".
        resp.raise_for_status()
        for bookmark in resp.html.find('.bookmark'):
            a = bookmark.find('.bookmark_title', first=True)
            when = bookmark.find('.when', first=True)
            if a is None or when is None:
                # Skip a malformed entry rather than crashing the whole
                # scrape with an AttributeError.
                continue
            yield {
                'url': a.attrs['href'],
                'title': a.text,
                'created': dateutil.parser.parse(when.attrs['title'])
            }
        # '#top_earlier' links to the next (older) page; it is absent on
        # the last page of results.
        a = resp.html.find('#top_earlier', first=True)
        if not a:
            break
        next_url = 'https://pinboard.in' + a.attrs['href']
        if pinboard_url == next_url:
            # Defensive: stop if the "earlier" link points back at the
            # page we just fetched, to avoid looping forever.
            break
        time.sleep(1)
        pinboard_url = next_url


# In[2]:

next(pinboard('covid-19'))

# Now we can write all the results to a CSV file. But let's look for a few
# variants: covid-19, covid_19, covid19. To avoid repeating the same urls we
# can keep track of them and only write them once.
# In[4]:

import csv

urls_seen = set()
# newline='' is required by the csv module so that rows aren't written with
# doubled line endings on Windows; the file is explicitly UTF-8 so titles
# with non-ASCII characters round-trip correctly.
with open('data/pinboard.csv', 'w', newline='', encoding='utf-8') as fh:
    out = csv.DictWriter(fh, fieldnames=['url', 'created', 'title'])
    out.writeheader()
    # Scrape a few tag variants; dedupe on URL so each link appears once
    # even if it was tagged with more than one variant.
    for hashtag in ['covid-19', 'covid_19', 'covid19']:
        for bookmark in pinboard(hashtag):
            if bookmark['url'] not in urls_seen:
                out.writerow(bookmark)
                urls_seen.add(bookmark['url'])


# In[5]:

import pandas

# prevent dataframe columns from being truncated
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', None)
pandas.set_option('display.max_colwidth', None)

df = pandas.read_csv('data/pinboard.csv')
df

# Just out of curiosity is there currently any overlap with the IIPC seeds?

# In[8]:

iipc = pandas.read_csv('data/iipc.csv')
overlap = set(iipc.url).intersection(set(df.url))
overlap

# Nice, there are a few!

# In[ ]: