JournalCrawler (soup): you can create a new JournalCrawler whose crawl_type is "soup".
import re
import time
import requests
import urllib
import pandas as pd
from bs4 import BeautifulSoup
from gummy.utils import whichJournal, canonicalize, get_driver, find_text, split_soup, split_soup_by_name
[success] local driver can be built. [failure] remote driver can't be built. DRIVER_TYPE: local
def get_soup(url):
    """Fetch *url* over HTTP and return its body parsed with html.parser.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        Parse tree of the raw (non-JS-rendered) response body.
    """
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser")
def get_soup_driver(url):
    """Load *url* in a browser driver and return the rendered page parsed.

    Unlike ``get_soup`` this executes the page's JavaScript (via the driver
    from ``gummy.utils.get_driver``) before parsing.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        Parse tree of the rendered page source.
    """
    with get_driver() as driver:
        driver.get(url)
        # Fixed pause to let client-side rendering finish before capture.
        time.sleep(3)
        page = driver.page_source.encode("utf-8")
    return BeautifulSoup(page, "html.parser")
get_contents_soup — example usage:
url = "https://www.google.com/"
soup = get_soup(url)
soup_driver = get_soup_driver(url)
(output) DRIVER_TYPE: local
get_title_from_soup
def get_title_from_soup(self, soup):
    """Return the article title text found in *soup*.

    Looks for an ``<h1 class="title">`` element via ``find_text``
    (which returns a placeholder string when nothing matches).

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    str
        The stripped title text.
    """
    return find_text(soup=soup, name="h1", class_="title", strip=True)
find_text(soup=soup, name="h1", class_="title", strip=True)
'[NOT FOUND]'
get_sections_from_soup
def get_sections_from_soup(self, soup):
    """Collect the section-heading elements of the article page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    list
        All ``<h2 class="section-title">`` elements, in document order.
    """
    return soup.find_all(name="h2", class_="section-title")
soup.find_all(name="h2", class_="section-title")
[]
soup_driver.find_all(name="h2", class_="section-title")
[]
get_sections_from_soup
def get_sections_from_soup(self, soup):
    """Return the article's ``<section>`` elements, abstract first.

    Sections whose ``aria-labelledby`` attribute appears in
    ``self.AvoidAriaLabel`` are dropped.  If the page carries an abstract
    (``div.art-abstract``), it is wrapped in a synthetic ``<section>``
    headed "0. Abstract" and placed at the front of the result.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    list
        Kept section elements in document order, abstract (if any) first.

    Notes
    -----
    The original implementation built an intermediate section list,
    inserted the abstract into it, and then *overwrote* the variable with
    a fresh ``find_all`` — silently discarding the abstract.  Filtering
    first and inserting afterwards preserves it.
    """
    sections = [
        e for e in soup.find_all(name="section")
        if e.get("aria-labelledby") not in self.AvoidAriaLabel
    ]
    abst = soup.find(name="div", class_="art-abstract")
    if abst is not None:
        abst_section = soup.new_tag(name="section", attrs={"type": "other"})
        abst_heading = soup.new_tag(name="h2")
        abst_heading.string = "0. Abstract"
        abst_section.append(abst_heading)
        abst_section.append(abst)  # moves the abstract div into the wrapper
        sections.insert(0, abst_section)
    return sections