#!/usr/bin/env python
# coding: utf-8
"""Robo-MZOIP -- scraper for Natura 2000 ecological-network assessments.

Background
----------
Natura 2000 legal framework: EU Birds Directive (79/409/EEC) and Habitats
Directive (92/43/EEC):

* http://ec.europa.eu/environment/nature/legislation/birdsdirective/index_en.htm
* http://ec.europa.eu/environment/nature/legislation/habitatsdirective/index_en.htm

Covers:

* Conservation areas significant for birds
* Conservation areas significant for species and habitats
* The appropriate-assessment procedure for the ecological network
  - preliminary assessments
  - main assessments

Problem: the web site of the Ministry of Environmental Protection and Energy
is hard to navigate and documents are added in a way that makes systematic
tracking impossible.

Solution: a web scraper (python3 + requests + BeautifulSoup 4) feeding a
Twitter bot.

TODO:
* Save results to .tsv tables
* Integrate into ``puobot.py``
* Twitter upload
"""

import argparse                 # noqa: F401 -- reserved for planned CLI integration
from datetime import datetime   # noqa: F401 -- reserved for planned integration
import os                       # noqa: F401 -- reserved for planned integration
import re
import sys                      # noqa: F401 -- reserved for planned integration

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Pages of interest on the Ministry's site.
BASE_URL_MZOE = 'http://mzoe.hr'
BASE_URL_OPEM = 'http://mzoe.hr/hr/priroda/ekoloska-mreza-natura-2000.html'

# Extracts the first 4-digit year from a section heading such as
# "Prethodne ocjene 2017." -> "2017".
GODPATTERN = re.compile(r".*?(\d{4}).*")


def get_soup(full_url):
    """Fetch a page and return it parsed as a BeautifulSoup object.

    Args:
        full_url (str): full URL of the page to fetch.

    Returns:
        BeautifulSoup: document parsed with the ``lxml`` parser.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously an error page would have been parsed silently).
    """
    r = requests.get(full_url)
    r.raise_for_status()  # fail loudly instead of scraping an error page
    return BeautifulSoup(r.content, 'lxml')


def _rows_from_links(links):
    """Build ``[section heading, link text, absolute URL]`` rows.

    The heading is taken from the tag two siblings before the link's
    great-grandparent -- on the Ministry page that is the accordion
    section title belonging to the document link.  This helper replaces
    three copy-pasted loops from the original notebook.
    """
    return [
        [a.parent.parent.parent.previous_sibling.previous_sibling
          .get_text().strip(),
         a.get_text(),
         BASE_URL_MZOE + a.get('href')]
        for a in links
    ]


def main():
    """Scrape all three assessment listings and print a preview of each."""
    soup = get_soup(BASE_URL_OPEM)

    # --- Main assessments under the Ministry's jurisdiction -------------
    opem = soup.find('div', 'accordion').find_all('div', 'accordion')
    opem_pd = pd.DataFrame(_rows_from_links(opem[0].find_all('a')))
    print(opem_pd.head(10))

    # --- Main assessments under county / City of Zagreb jurisdiction ----
    opem_z_pd = pd.DataFrame(_rows_from_links(opem[1].find_all('a')))
    print(opem_z_pd.head(10))

    # --- Preliminary assessments, grouped by year ------------------------
    prethodne = soup.find_all('div', 'accordion')[3]
    prethodne_raw = []
    for god in prethodne.find_all('h3'):
        # Heading text carries the year; documents live two siblings later.
        year = re.sub(GODPATTERN, "\\1", god.get_text().strip())
        for dok in god.next_sibling.next_sibling.find_all('a'):
            prethodne_raw.append(
                [year, dok.get_text(), BASE_URL_MZOE + dok.get('href')])
    prethodne_pd = pd.DataFrame(prethodne_raw)
    print(prethodne_pd.head(10))


if __name__ == "__main__":
    main()