#!/usr/bin/env python
# coding: utf-8

# In[1]:

import re
import string
import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

CONCURRENT_REQUESTS = 20


# In[2]:

# These helpers target the pre-2.0 aiohttp API (module-level aiohttp.get()
# and the aiohttp.Timeout context manager).

@asyncio.coroutine
def fetch(url, semaphore):
    try:
        # The semaphore caps how many requests are in flight at once.
        with (yield from semaphore):
            with aiohttp.Timeout(60):
                response = yield from aiohttp.get(url)
                html = yield from response.text(encoding='ISO-8859-1')
                return html
    except asyncio.TimeoutError:
        # Retry the URL until it answers within the timeout.
        print('Timeout: {}'.format(url))
        return (yield from fetch(url, semaphore))


@asyncio.coroutine
def get_soup(url, semaphore):
    # fetch() already acquires the semaphore; acquiring it here as well can
    # deadlock once every permit is held by a get_soup() waiting on fetch().
    html = yield from fetch(url, semaphore)
    return BeautifulSoup(html, 'html.parser'), url


# Create the author list
@asyncio.coroutine
def extract_author_nums(urls, semaphore):
    out_list = []
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]),
                     total=len(urls)):
        soup, outurl = yield from item
        for link in soup.find_all('div', align='left'):
            out_list.append(re.findall(r'id=(\d*)', link.a.get('href'))[0])
    return out_list


list_of_urls = []
for letter in string.ascii_uppercase:
    list_of_urls.append('http://visionsciences1.org/vss_public/mobile/author_index.php?view={}#overview'.format(letter))

semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
author_nums = loop.run_until_complete(extract_author_nums(list_of_urls, semaphore))
author_nums = list(set(author_nums))  # deduplicate author ids
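
# In[ ]:

# Aside: current aiohttp releases (3.x) no longer ship the module-level
# aiohttp.get()/aiohttp.Timeout used above. A minimal sketch of the same
# download step on aiohttp 3.x, assuming a single shared ClientSession;
# fetch_modern and scrape_all are hypothetical names, not used by the
# scraper below.

async def fetch_modern(url, session, semaphore):
    # Same concurrency cap as fetch(); the session owns the timeout.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text(encoding='ISO-8859-1')

async def scrape_all(urls):
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    timeout = aiohttp.ClientTimeout(total=60)  # mirrors aiohttp.Timeout(60)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        return await asyncio.gather(*(fetch_modern(u, session, semaphore)
                                      for u in urls))
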
# In[3]:

# Spot-check a single author page synchronously before the full crawl.
response = requests.get('http://visionsciences1.org/vss_public/mobile/authors_index_list.php?id=64245#overview')
soup = BeautifulSoup(response.text, 'html.parser')


# In[4]:

list_of_urls = []
for author in author_nums:
    list_of_urls.append('http://visionsciences1.org/vss_public/mobile/authors_index_list.php?id={}#overview'.format(author))


@asyncio.coroutine
def extract_abstract_nums(urls, semaphore):
    abstracts = {}
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]),
                     total=len(urls)):
        soup, outurl = yield from item
        # Each abstract is linked twice on an author page, so take every other match.
        for link in soup.find_all('a', href=re.compile('abstractno='))[::2]:
            abstract_num = int(re.findall(r'abstractno=(\d*)', link.get('href'))[0])
            abstracts[abstract_num] = {}
            abstracts[abstract_num]['id'] = link.text.strip()
    return abstracts


semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
abstracts = loop.run_until_complete(extract_abstract_nums(list_of_urls, semaphore))


# In[5]:

list_of_urls = []
for abstract_num in abstracts:
    list_of_urls.append('http://visionsciences1.org/vss_public/core_routines/view_abstract_no.php?abstractno={}'.format(abstract_num))


@asyncio.coroutine
def extract_abstracts(urls, semaphore):
    out_dict = {}
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]),
                     total=len(urls)):
        soup, outurl = yield from item
        abstract = int(re.findall(r'abstractno=(\d*)', outurl)[0])
        out_dict[abstract] = {}
        out_dict[abstract]['title'] = soup.table.find_all('tr')[0].text.strip()
        authinfo = soup.table.find_all('tr')[1]
        # Strip the superscript affiliation markers from the author row.
        for sup in authinfo.find_all('sup'):
            sup.extract()
        # The author row separates the author list from the affiliations
        # with <br/> tags.
        authinfo = str(authinfo).split('<br/>')
        authors = BeautifulSoup(authinfo[0], 'html.parser').text.strip().split(', ')
        out_dict[abstract]['author'] = [" ".join(a.split()) for a in authors]  # collapse whitespace in names
        out_dict[abstract]['affiliation'] = BeautifulSoup('\n'.join(authinfo[1:]), 'html.parser').text.strip().split('\n')
        # Drop the first italicized element before grabbing the page text.
        soup.i.extract()
        out_dict[abstract]['abstract'] = soup.text.strip()
    return out_dict


semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
abstract_contents = loop.run_until_complete(extract_abstracts(list_of_urls, semaphore))

for key in abstracts:
    abstracts[key].update(abstract_contents[key])


# In[6]:

import json

with open('visvssrelationships_data_2016.json', 'w') as f:
    json.dump(abstracts, f, indent=4)


# In[ ]:
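
# A quick sanity check of the dump, a minimal sketch assuming the file above
# was written successfully (note that JSON object keys come back as strings):

with open('visvssrelationships_data_2016.json') as f:
    data = json.load(f)

sample = next(iter(data.values()))
print(len(data), sorted(sample))  # expect keys: abstract, affiliation, author, id, title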