import re
import string
import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Upper bound on simultaneous HTTP requests (shared asyncio.Semaphore size).
CONCURRENT_REQUESTS = 20
# Generator-based coroutine style is kept to match the rest of the file
# (aiohttp.get / aiohttp.Timeout are the old pre-1.0 client API).
@asyncio.coroutine
def fetch(url, semaphore):
    """Download *url* and return its HTML decoded as ISO-8859-1.

    Concurrency is bounded by *semaphore*.  On asyncio.TimeoutError the
    request is retried indefinitely; the retry is now an iterative loop
    instead of the original recursive `fetch()` call, so a page that
    times out repeatedly no longer grows the frame stack without bound.
    """
    while True:
        try:
            with (yield from semaphore):
                with aiohttp.Timeout(60):
                    response = yield from aiohttp.get(url)
                    html = yield from response.text(encoding='ISO-8859-1')
                    return html
        except asyncio.TimeoutError:
            # Semaphore slot is released by the `with` on the way out,
            # so waiting retries do not starve other requests.
            print('Timeout: {}'.format(url))
@asyncio.coroutine
def get_soup(url, semaphore):
    """Fetch *url* and return a ``(BeautifulSoup, url)`` pair.

    ``fetch()`` already guards the actual HTTP request with *semaphore*.
    The previous version acquired the semaphore here as well, so every
    request held two of the CONCURRENT_REQUESTS slots and, once all
    slots were taken by callers waiting on their inner ``fetch()``
    acquisition, no task could make progress.  Acquiring only inside
    ``fetch()`` keeps concurrency correctly bounded.
    """
    html = yield from fetch(url, semaphore)
    return (BeautifulSoup(html, 'html.parser'), url)
# Create the author list
@asyncio.coroutine
def extract_author_nums(urls, semaphore):
    """Scrape the per-letter index pages and return a list of author id strings.

    Processes pages as they complete (order is not preserved).  Hrefs
    whose query string carries no numeric ``id=`` value are skipped;
    the old ``re.findall(r'id=(\\d*)')[0]`` crashed with IndexError
    when ``id=`` was absent and silently appended '' when it was empty.
    """
    out_list = []
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]),
                     total=len(urls)):
        soup, outurl = yield from item
        # find_all (not the deprecated findAll alias) to match the rest of the file.
        for link in soup.find_all('div', align='left'):
            match = re.search(r'id=(\d+)', link.a.get('href'))
            if match:
                out_list.append(match.group(1))
    return out_list
# One author-index page per initial letter (A-Z).
list_of_urls = ['http://visionsciences1.org/vss_public/mobile/author_index.php?view={}#overview'.format(letter)
                for letter in string.ascii_uppercase]
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
author_nums = loop.run_until_complete(extract_author_nums(list_of_urls, semaphore))
# Authors can appear under more than one letter; keep each id once.
author_nums = list(set(author_nums))
# notebook output: 100%|██████████| 26/26 [00:02<00:00, 7.76it/s]
# NOTE(review): this one-off request/soup pair is never read again below;
# it looks like leftover exploration code from the notebook session.
response = requests.get('http://visionsciences1.org/vss_public/mobile/authors_index_list.php?id=64245#overview')
soup = BeautifulSoup(response.text, 'html.parser')
# One listing page per author id discovered above.
list_of_urls = ['http://visionsciences1.org/vss_public/mobile/authors_index_list.php?id={}#overview'.format(author)
                for author in author_nums]
@asyncio.coroutine
def extract_abstract_nums(urls, semaphore):
    """Scrape every author listing page.

    Returns a dict mapping abstract number (int) to a record currently
    holding only the presentation id string, e.g. ``{'id': '23.456'}``.
    """
    abstracts = {}
    pending = [get_soup(u, semaphore) for u in urls]
    for future in tqdm(asyncio.as_completed(pending), total=len(urls)):
        soup, _ = yield from future
        anchors = soup.find_all('a', href=re.compile('abstractno='))
        # Each abstract is linked twice on the page; [::2] keeps one copy.
        for anchor in anchors[::2]:
            number = int(re.findall(r'abstractno=(\d*)', anchor.get('href'))[0])
            abstracts[number] = {'id': anchor.text.strip()}
    return abstracts
# Second crawl: one listing page per author id.
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
abstracts = loop.run_until_complete(extract_abstract_nums(list_of_urls, semaphore))
# notebook output: 100%|██████████| 3500/3500 [01:17<00:00, 47.21it/s]
# One full-text page per abstract number collected above.
list_of_urls = ['http://visionsciences1.org/vss_public/core_routines/view_abstract_no.php?abstractno={}'.format(abstract_num)
                for abstract_num in abstracts]
@asyncio.coroutine
def extract_abstracts(urls, semaphore):
    """Download each abstract page and parse its fields.

    Returns ``{abstract_num: {'title', 'author', 'affiliation', 'abstract'}}``.
    Fixes vs. the original: the URL regex is now a raw string (``'\\d'``
    in a plain string is an invalid escape and warns on modern Python),
    and the deprecated ``findAll`` alias is replaced by ``find_all``.
    """
    out_dict = {}
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]),
                     total=len(urls)):
        soup, outurl = yield from item
        # Recover the abstract number from the URL so results can be keyed
        # correctly regardless of completion order.
        abstract = int(re.findall(r'abstractno=(\d*)', outurl)[0])
        out_dict[abstract] = {}
        out_dict[abstract]['title'] = soup.table.find_all('tr')[0].text.strip()
        authinfo = soup.table.find_all('tr')[1]
        # Drop footnote markers (<sup>) before reading author/affiliation text.
        [t.extract() for t in authinfo.find_all('sup')]
        # NOTE(review): assumes the serialized row contains literal '<br>'
        # between the author line and the affiliation lines — confirm
        # against the pages actually served.
        authinfo = str(authinfo).split('<br>')
        authors = BeautifulSoup(authinfo[0], 'html.parser').text.strip().split(', ')
        out_dict[abstract]['author'] = [" ".join(a.split()) for a in authors]
        out_dict[abstract]['affiliation'] = BeautifulSoup('\n'.join(authinfo[1:]), 'html.parser').text.strip().split('\n')
        # NOTE(review): assumes every page has an <i> (citation) tag;
        # pages without one would raise AttributeError here.
        soup.i.extract()
        out_dict[abstract]['abstract'] = soup.text.strip()
    return out_dict
# Third crawl: the full text of every abstract.
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
abstract_contents = loop.run_until_complete(extract_abstracts(list_of_urls, semaphore))
# Fold the parsed contents into the id records gathered earlier.
for key, record in abstracts.items():
    record.update(abstract_contents[key])
# notebook output: 100%|██████████| 1459/1459 [00:18<00:00, 77.36it/s]
import json

# Persist the merged records for downstream analysis.
output_path = 'visvssrelationships_data_2016.json'
with open(output_path, 'w') as outfile:
    json.dump(abstracts, outfile, indent=4)