#!/usr/bin/env python
# coding: utf-8

# # Pinboard
#
# Pinboard is a social bookmarking site where people share links to content
# and *tag* them by assigning a word that describes the content. These tags
# are free-form, and each user decides which ones to use.
#
# Pinboard has a nice [API](https://pinboard.in/api/) for interacting with
# your own bookmarks, but not for getting all public bookmarks for a tag.
# Pinboard also makes all tag pages available as RSS, e.g.
# https://feeds.pinboard.in/rss/t:covid-19 but it unfortunately doesn't allow
# paging back in time.
#
# So unfortunately we're going to have to scrape the pages. But fortunately
# this won't be too difficult with the
# [requests_html](https://requests-html.kennethreitz.org/) module because
# Pinboard has done such a nice job of using
# [semantic html](https://en.wikipedia.org/wiki/Semantic_HTML).

# In[1]:

import time

import dateutil.parser
import requests_html


def pinboard(hashtag):
    """Yield public Pinboard bookmarks tagged *hashtag*, newest first.

    Scrapes https://pinboard.in/t:<hashtag> and follows the "earlier"
    pagination link until it runs out, sleeping one second between page
    fetches to be polite to the server.

    Yields:
        dict with keys 'url' (str), 'title' (str) and 'created'
        (a datetime parsed from the bookmark's timestamp).

    Raises:
        requests.HTTPError: if a page request returns an error status.
    """
    http = requests_html.HTMLSession()
    pinboard_url = 'https://pinboard.in/t:{}'.format(hashtag)
    while True:
        resp = http.get(pinboard_url)
        # Fail loudly on an HTTP error instead of silently parsing an
        # error page as "no bookmarks".
        resp.raise_for_status()
        for bookmark in resp.html.find('.bookmark'):
            a = bookmark.find('.bookmark_title', first=True)
            when = bookmark.find('.when', first=True)
            if a is None or when is None:
                # Skip a malformed entry rather than crashing the whole
                # scrape with an AttributeError.
                continue
            yield {
                'url': a.attrs['href'],
                'title': a.text,
                'created': dateutil.parser.parse(when.attrs['title'])
            }
        # '#top_earlier' links to the next (older) page; it is absent on
        # the last page of results.
        a = resp.html.find('#top_earlier', first=True)
        if not a:
            break
        next_url = 'https://pinboard.in' + a.attrs['href']
        if pinboard_url == next_url:
            # Defensive: stop if the "earlier" link points back at the
            # page we just fetched, to avoid looping forever.
            break
        time.sleep(1)
        pinboard_url = next_url


# In[2]:

next(pinboard('covid-19'))

# Now we can write all the results to a CSV file. But let's look for a few
# variants: covid-19, covid_19, covid19. To avoid repeating the same urls we
# can keep track of them and only write them once.
# In[4]:

import csv

urls_seen = set()
# newline='' is required by the csv module so that rows aren't written with
# doubled line endings on Windows; the file is explicitly UTF-8 so titles
# with non-ASCII characters round-trip correctly.
with open('data/pinboard.csv', 'w', newline='', encoding='utf-8') as fh:
    out = csv.DictWriter(fh, fieldnames=['url', 'created', 'title'])
    out.writeheader()
    # Scrape a few tag variants; dedupe on URL so each link appears once
    # even if it was tagged with more than one variant.
    for hashtag in ['covid-19', 'covid_19', 'covid19']:
        for bookmark in pinboard(hashtag):
            if bookmark['url'] not in urls_seen:
                out.writerow(bookmark)
                urls_seen.add(bookmark['url'])


# In[5]:

import pandas

# prevent dataframe columns from being truncated
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', None)
pandas.set_option('display.max_colwidth', None)

df = pandas.read_csv('data/pinboard.csv')
df

# Just out of curiosity is there currently any overlap with the IIPC seeds?

# In[8]:

iipc = pandas.read_csv('data/iipc.csv')
overlap = set(iipc.url).intersection(set(df.url))
overlap

# Nice, there are a few!

# In[ ]: