JournalCrawler (soup): you can create a new JournalCrawler whose crawl_type is "soup".
import re
import time
import requests
import urllib
import pandas as pd
from bs4 import BeautifulSoup
from gummy.utils import whichJournal, canonicalize, get_driver, find_text, split_soup, split_soup_by_name
[success] local driver can be built. [failure] remote driver can't be built. DRIVER_TYPE: local
def get_soup(url):
    """Fetch *url* over HTTP and return its body parsed with html.parser.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        Parse tree of the raw (non-JS-rendered) response body.
    """
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser")
def get_soup_driver(url):
    """Load *url* in a browser driver and return the rendered page parsed.

    Unlike ``get_soup`` this executes the page's JavaScript (via the driver
    from ``gummy.utils.get_driver``) before parsing.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        Parse tree of the rendered page source.
    """
    with get_driver() as driver:
        driver.get(url)
        # Fixed pause to let client-side rendering finish before capture.
        time.sleep(3)
        page = driver.page_source.encode("utf-8")
    return BeautifulSoup(page, "html.parser")
get_contents_soup — example usage:
url = "https://www.google.com/"
soup = get_soup(url)
soup_driver = get_soup_driver(url)
(output) DRIVER_TYPE: local
get_title_from_soup
def get_title_from_soup(self, soup):
    """Return the article title text found in *soup*.

    Looks for an ``<h1 class="title">`` element via ``find_text``
    (which returns a placeholder string when nothing matches).

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    str
        The stripped title text.
    """
    return find_text(soup=soup, name="h1", class_="title", strip=True)
find_text(soup=soup, name="h1", class_="title", strip=True)
'[NOT FOUND]'
get_sections_from_soup
def get_sections_from_soup(self, soup):
    """Collect the section-heading elements of the article page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    list
        All ``<h2 class="section-title">`` elements, in document order.
    """
    return soup.find_all(name="h2", class_="section-title")
soup.find_all(name="h2", class_="section-title")
[]
soup_driver.find_all(name="h2", class_="section-title")
[]
get_sections_from_soup
def get_sections_from_soup(self, soup):
    """Return the article's ``<section>`` elements, abstract first.

    Sections whose ``aria-labelledby`` attribute appears in
    ``self.AvoidAriaLabel`` are dropped.  If the page carries an abstract
    (``div.art-abstract``), it is wrapped in a synthetic ``<section>``
    headed "0. Abstract" and placed at the front of the result.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    list
        Kept section elements in document order, abstract (if any) first.

    Notes
    -----
    The original implementation built an intermediate section list,
    inserted the abstract into it, and then *overwrote* the variable with
    a fresh ``find_all`` — silently discarding the abstract.  Filtering
    first and inserting afterwards preserves it.
    """
    sections = [
        e for e in soup.find_all(name="section")
        if e.get("aria-labelledby") not in self.AvoidAriaLabel
    ]
    abst = soup.find(name="div", class_="art-abstract")
    if abst is not None:
        abst_section = soup.new_tag(name="section", attrs={"type": "other"})
        abst_heading = soup.new_tag(name="h2")
        abst_heading.string = "0. Abstract"
        abst_section.append(abst_heading)
        abst_section.append(abst)  # moves the abstract div into the wrapper
        sections.insert(0, abst_section)
    return sections