In [1]:
import re
import string
import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

CONCURRENT_REQUESTS = 20
In [2]:
@asyncio.coroutine
def fetch(url, semaphore):
    # Fetch one URL, capping concurrency with the semaphore and retrying on timeout
    try:
        with (yield from semaphore):
            with aiohttp.Timeout(60):
                response = yield from aiohttp.get(url)
                html = yield from response.text(encoding='ISO-8859-1')
                return html
    except asyncio.TimeoutError:
        # The with-block has already released the semaphore here, so the retry reacquires it
        print('Timeout: {}'.format(url))
        return (yield from fetch(url, semaphore))

@asyncio.coroutine
def get_soup(url, semaphore):
    # fetch() already acquires the semaphore, so don't acquire it again here;
    # holding two permits per request would halve the effective concurrency.
    html = yield from fetch(url, semaphore)
    return (BeautifulSoup(html, 'html.parser'), url)
    
# Create the author list: each index page links to one page per author,
# and the author number is the id= parameter of that link
@asyncio.coroutine
def extract_author_nums(urls, semaphore):
    out_list = []
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]), total=len(urls)):
        soup, outurl = yield from item
        for div in soup.find_all('div', align='left'):
            out_list.append(re.findall(r'id=(\d+)', div.a.get('href'))[0])
    return out_list

# One author-index page per letter A-Z
list_of_urls = []
for letter in string.ascii_uppercase:
    list_of_urls.append('http://visionsciences1.org/vss_public/mobile/author_index.php?view={}#overview'.format(letter))

semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
author_nums = loop.run_until_complete(extract_author_nums(list_of_urls, semaphore))
author_nums = list(set(author_nums))
100%|██████████| 26/26 [00:02<00:00,  7.76it/s]
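A note on the asyncio style: the coroutines above use the generator-based @asyncio.coroutine / yield from form and the module-level aiohttp.get() and aiohttp.Timeout helpers that existed when this notebook was run; both have since been removed from aiohttp. On Python 3.7+ with aiohttp 3.x the same fetch-and-parse step would look roughly like the sketch below, which shares one ClientSession across requests and moves the timeout onto the session. This is a rewrite for reference, not the code that produced the output above.

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch(session, url, semaphore):
    # The semaphore caps in-flight requests; the session's ClientTimeout bounds each one
    async with semaphore:
        async with session.get(url) as response:
            return await response.text(encoding='ISO-8859-1')

async def get_soup(session, url, semaphore):
    html = await fetch(session, url, semaphore)
    return BeautifulSoup(html, 'html.parser'), url

async def fetch_all(urls):
    # CONCURRENT_REQUESTS is defined in the first cell
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    timeout = aiohttp.ClientTimeout(total=60)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        return await asyncio.gather(*(get_soup(session, u, semaphore) for u in urls))

# soups = asyncio.run(fetch_all(list_of_urls))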
In [3]:
# Quick synchronous look at a single author page (not used by the later cells)
response = requests.get('http://visionsciences1.org/vss_public/mobile/authors_index_list.php?id=64245#overview')
soup = BeautifulSoup(response.text, 'html.parser')
In [4]:
list_of_urls = []
for author in author_nums:
    list_of_urls.append('http://visionsciences1.org/vss_public/mobile/authors_index_list.php?id={}#overview'.format(author))

# Collect the abstract numbers listed on each author's page
@asyncio.coroutine
def extract_abstract_nums(urls, semaphore):
    abstracts = {}
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]), total=len(urls)):
        soup, outurl = yield from item
        # Take every other matching link; each abstract is listed twice on the page
        for link in soup.find_all('a', href=re.compile('abstractno='))[::2]:
            abstract_num = int(re.findall(r'abstractno=(\d+)', link.get('href'))[0])
            abstracts[abstract_num] = {}
            abstracts[abstract_num]['id'] = link.text.strip()  # displayed abstract ID (the link text)
    return abstracts

semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
abstracts = loop.run_until_complete(extract_abstract_nums(list_of_urls, semaphore))
100%|██████████| 3500/3500 [01:17<00:00, 47.21it/s]
In [5]:
list_of_urls = []
for abstract_num in abstracts:
    list_of_urls.append('http://visionsciences1.org/vss_public/core_routines/view_abstract_no.php?abstractno={}'.format(abstract_num))

# Pull the title, authors, affiliations and abstract text from each abstract page
@asyncio.coroutine
def extract_abstracts(urls, semaphore):
    out_dict = {}
    for item in tqdm(asyncio.as_completed([get_soup(u, semaphore) for u in urls]), total=len(urls)):
        soup, outurl = yield from item
        abstract = int(re.findall(r'abstractno=(\d+)', outurl)[0])
        out_dict[abstract] = {}
        # First table row holds the title, second the author/affiliation block
        out_dict[abstract]['title'] = soup.table.find_all('tr')[0].text.strip()
        authinfo = soup.table.find_all('tr')[1]
        # Drop the superscript affiliation markers, then split authors from affiliations on <br>
        for sup in authinfo.find_all('sup'):
            sup.extract()
        authinfo = str(authinfo).split('<br>')
        authors = BeautifulSoup(authinfo[0], 'html.parser').text.strip().split(', ')
        out_dict[abstract]['author'] = [" ".join(a.split()) for a in authors]
        out_dict[abstract]['affiliation'] = BeautifulSoup('\n'.join(authinfo[1:]), 'html.parser').text.strip().split('\n')
        # Remove the first <i> element, then keep the remaining page text as the abstract
        soup.i.extract()
        out_dict[abstract]['abstract'] = soup.text.strip()
    return out_dict

semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
loop = asyncio.get_event_loop()
abstract_contents = loop.run_until_complete(extract_abstracts(list_of_urls, semaphore))
for key in abstracts:
    abstracts[key].update(abstract_contents[key])
100%|██████████| 1459/1459 [00:18<00:00, 77.36it/s]
In [6]:
import json
with open('visvssrelationships_data_2016.json', 'w') as f:
    json.dump(abstracts, f, indent=4)
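One thing to keep in mind when reading the file back: JSON object keys are always strings, so the integer abstract numbers used as dictionary keys above come back as str after loading. A small, hypothetical round-trip check:

import json

with open('visvssrelationships_data_2016.json') as f:
    loaded = json.load(f)

# Keys were ints when dumped but are strings after loading; restore them if needed
loaded = {int(num): entry for num, entry in loaded.items()}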