A megadott induló linken keresztül egy WEB lapértlapmezés után a linkekk (külső vagy belső) kigyüjtése és kiiratása a képernyőre
from urllib.request import urlopen
from bs4 import BeautifulSoup
szamlal=1
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
szamlal += 1
if 'href' in link.attrs:
print(link.attrs['href'])
if szamlal >=20: ## Csak 20 db kilistázása
break
/wiki/Wikipedia:Protection_policy#semi #mw-head #p-search /wiki/Kevin_Bacon_(disambiguation) /wiki/File:Kevin_Bacon_SDCC_2014.jpg /wiki/Philadelphia /wiki/Pennsylvania /wiki/Kyra_Sedgwick /wiki/Sosie_Bacon #cite_note-1 /wiki/Edmund_Bacon_(architect) /wiki/Michael_Bacon_(musician) /wiki/Holly_Near http://baconbros.com/ #cite_note-2 #cite_note-actor-3 /wiki/Footloose_(1984_film) /wiki/JFK_(film)
A linkek elemzése helyi domén elemzés (a Wikin belülre mutató linkek)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
szamlal=1
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
'a', href=re.compile('^(/wiki/)((?!:).)*$')):
szamlal += 1
if 'href' in link.attrs:
print(link.attrs['href'])
if szamlal >=20: ## Csak 20 db kilistázása
break
/wiki/Kevin_Bacon_(disambiguation) /wiki/Philadelphia /wiki/Pennsylvania /wiki/Kyra_Sedgwick /wiki/Sosie_Bacon /wiki/Edmund_Bacon_(architect) /wiki/Michael_Bacon_(musician) /wiki/Holly_Near /wiki/Footloose_(1984_film) /wiki/JFK_(film) /wiki/A_Few_Good_Men /wiki/Apollo_13_(film) /wiki/Mystic_River_(film) /wiki/Sleepers /wiki/The_Woodsman_(2004_film) /wiki/He_Said,_She_Said_(film) /wiki/Fox_Broadcasting_Company /wiki/The_Following /wiki/HBO
A beső cikkek linkjeinek vélbejárása és kiíratása
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re
szamlal=1
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
bs = BeautifulSoup(html, 'html.parser')
return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
newArticle = links[random.randint(0, len(links)-1)].attrs['href']
print(newArticle)
szamlal += 1
if szamlal >=20: ## Csak 20 db kilistázása
break
links = getLinks(newArticle)
/wiki/Golden_Globe_Award /wiki/To_Kill_a_Mockingbird_(film) /wiki/La_Jolla,_California /wiki/Klamath_Basin /wiki/Crater_Lake /wiki/Rhyodacite /wiki/Latite /wiki/Petrology /wiki/Seismology /wiki/Volcanology /wiki/INGV /wiki/CNN /wiki/Warner_Bros._International_Television_Production /wiki/DirecTV-5 /wiki/AT%26T_satellite_fleet /wiki/Red_by_HBO /wiki/Sky_News_International /wiki/Digital_television_in_the_United_Kingdom /wiki/Royal_Television_Society
Önmeghívó eljárással bejárluk az egészet és az új lapokat eltároljuk egy pages változóba
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
szamlal=1
pages = set()
def getLinks(pageUrl):
global pages, szamlal
html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
if 'href' in link.attrs:
if link.attrs['href'] not in pages:
szamlal += 1
if szamlal >=20: ## Csak 20 db kilistázása
break
#We have encountered a new page
newPage = link.attrs['href']
print(newPage)
pages.add(newPage)
getLinks(newPage)
getLinks('')
/wiki/Wikipedia /wiki/Wikipedia:Protection_policy#semi /wiki/Wikipedia:Requests_for_page_protection /wiki/Wikipedia:Protection_policy#move /wiki/Wikipedia:Lists_of_protected_pages /wiki/Wikipedia:Protection_policy /wiki/Wikipedia:Perennial_proposals /wiki/Wikipedia:Reliable_sources/Perennial_sources /wiki/Wikipedia:Reliable_sources /wiki/Wikipedia:WikiProject_Reliability /wiki/Wikipedia:WRE /wiki/File:People_icon.svg /wiki/Special:WhatLinksHere/File:People_icon.svg /wiki/Help:What_links_here /wiki/Wikipedia:Project_namespace#How-to_and_information_pages /wiki/Wikipedia:Policies_and_guidelines /wiki/Wikipedia:WikiProject_Politics /wiki/File:A_coloured_voting_box.svg
Hibakezeléssel bejárjuk a WEB lapokat és az új lapok tartalmát listához adjuk.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
szamlal=1
pages = set()
def getLinks(pageUrl):
global pages, szamlal
html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
bs = BeautifulSoup(html, 'html.parser')
try:
print(bs.h1.get_text())
print(bs.find(id ='mw-content-text').find_all('p')[0])
print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
except AttributeError:
print('This page is missing something! Continuing.')
for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
if 'href' in link.attrs:
if link.attrs['href'] not in pages:
szamlal += 1
if szamlal >=5: ## Csak 5 db kilistázása
break
#We have encountered a new page
newPage = link.attrs['href']
print('-'*20)
print(newPage)
pages.add(newPage)
getLinks(newPage)
getLinks('')
Main Page <p><b><a href="/wiki/Jean-Fran%C3%A7ois-Marie_de_Surville" title="Jean-François-Marie de Surville">Jean-François-Marie de Surville</a></b> (1717–1770) was a merchant captain with the <a href="/wiki/French_East_India_Company" title="French East India Company">French East India Company</a> who commanded a voyage of exploration to the Pacific in 1769 and 1770. Born in <a href="/wiki/Brittany" title="Brittany">Brittany</a>, France, Surville joined the company when he was 10 years old. For the next several years, he sailed on voyages in Indian and Chinese waters. In 1740, he joined the <a href="/wiki/French_Navy" title="French Navy">French Navy</a>. He fought in the <a href="/wiki/War_of_the_Austrian_Succession" title="War of the Austrian Succession">War of the Austrian Succession</a> and the <a href="/wiki/Seven_Years%27_War" title="Seven Years' War">Seven Years' War</a>, twice becoming a prisoner of war. In 1769, in command of <i>Saint Jean-Baptiste</i>, he sailed from India on an expedition to the Pacific looking for trading opportunities. He explored the seas around the <a href="/wiki/Solomon_Islands" title="Solomon Islands">Solomon Islands</a> and anchored in December at <a href="/wiki/Doubtless_Bay" title="Doubtless Bay">Doubtless Bay</a>, New Zealand <i>(commemorative plaque pictured)</i>. Part of his route around New Zealand overlapped that of <a href="/wiki/James_Cook" title="James Cook">James Cook</a> in <a href="/wiki/HMS_Endeavour" title="HMS Endeavour"><i>Endeavour</i></a>, who had preceded him by only a few days. Three months later, Surville drowned off the coast of Peru while seeking help for his <a href="/wiki/Scurvy" title="Scurvy">scurvy</a>-afflicted crew. (<b><a href="/wiki/Jean-Fran%C3%A7ois-Marie_de_Surville" title="Jean-François-Marie de Surville">Full article...</a></b>) </p> This page is missing something! Continuing. -------------------- /wiki/Wikipedia Wikipedia <p class="mw-empty-elt"> </p> This page is missing something! Continuing. -------------------- /wiki/Wikipedia:Protection_policy#semi Wikipedia:Protection policy <p class="mw-empty-elt"> </p> This page is missing something! Continuing. -------------------- /wiki/Wikipedia:Requests_for_page_protection Wikipedia:Requests for page protection <p>This page is for requesting that a page, file or template be <b> fully protected</b>, <b>create protected</b> (<a href="/wiki/Wikipedia:Protection_policy#Creation_protection" title="Wikipedia:Protection policy">salted</a>), <b>extended confirmed protected</b>, <b>semi-protected</b>, added to <b>pending changes</b>, <b>move-protected</b>, <b>template protected</b>, <b>upload protected</b> (file-specific), or <b>unprotected</b>. Please read up on the <a href="/wiki/Wikipedia:Protection_policy" title="Wikipedia:Protection policy">protection policy</a>. Full protection is used to stop edit warring between multiple users or to prevent vandalism to <a href="/wiki/Wikipedia:High-risk_templates" title="Wikipedia:High-risk templates">high-risk templates</a>; semi-protection and pending changes are usually used only to prevent IP and new user vandalism (see the <a href="/wiki/Wikipedia:Rough_guide_to_semi-protection" title="Wikipedia:Rough guide to semi-protection">rough guide to semi-protection</a>); and move protection is used to stop <a href="/wiki/Wikipedia:Page-move_war" title="Wikipedia:Page-move war">page-move wars</a>. Extended confirmed protection is used where semi-protection has proved insufficient (see the <a href="/wiki/Wikipedia:Rough_guide_to_extended_confirmed_protection" title="Wikipedia:Rough guide to extended confirmed protection">rough guide to extended confirmed protection</a>) </p> This page is missing something! Continuing.
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
szamlal=1
#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, includeUrl):
includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
internalLinks = []
#Finds all links that begin with a "/"
for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
if link.attrs['href'] is not None:
if link.attrs['href'] not in internalLinks:
if(link.attrs['href'].startswith('/')):
internalLinks.append(includeUrl+link.attrs['href'])
else:
internalLinks.append(link.attrs['href'])
return internalLinks
#Retrieves a list of all external links found on a page
def getExternalLinks(bs, excludeUrl):
global szamlal
externalLinks = []
#Finds all links that start with "http" that do
#not contain the current URL
for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
szamlal += 1
if szamlal >= 5: ## Csak 5 db kilistázása
break
if link.attrs['href'] is not None:
if link.attrs['href'] not in externalLinks:
externalLinks.append(link.attrs['href'])
return externalLinks
def getRandomExternalLink(startingPage):
try:
html = urlopen(startingPage)
bs = BeautifulSoup(html, 'html.parser')
externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
if len(externalLinks) == 0:
print('No external links, looking around the site for one')
domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
internalLinks = getInternalLinks(bs, domain)
return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
else:
return externalLinks[random.randint(0, len(externalLinks)-1)]
except Exception:
return None
def followExternalOnly(startingSite):
externalLink = getRandomExternalLink(startingSite)
if externalLink == None:
quit
else:
print('Random external link is: {}'.format(externalLink))
followExternalOnly(externalLink)
followExternalOnly('https://klajosw.blogspot.com/p/kezdolap.html')
Random external link is: https://klajosw.blogspot.hu/ No external links, looking around the site for one
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
allExtLinks = set()
allIntLinks = set()
szamlal=1
def getAllExternalLinks(siteUrl):
global szamlal
html = urlopen(siteUrl)
domain = '{}://{}'.format(urlparse(siteUrl).scheme,
urlparse(siteUrl).netloc)
bs = BeautifulSoup(html, 'html.parser')
internalLinks = getInternalLinks(bs, domain)
externalLinks = getExternalLinks(bs, domain)
for link in externalLinks:
szamlal += 1
if szamlal >= 10: ## Csak 10 db kilistázása
break
if link not in allExtLinks:
allExtLinks.add(link)
print('Új külsö link : ', link)
for link in internalLinks:
szamlal += 1
if szamlal >= 10: ## Csak 10 db kilistázása
break
if link not in allIntLinks:
print('Új belső link : ', link)
allIntLinks.add(link)
getAllExternalLinks(link)
allIntLinks.add('https://klajosw.blogspot.com')
getAllExternalLinks('https://klajosw.blogspot.com/p/kezdolap.html')
Új külsö link : https://klajosw.blogspot.com/ Új külsö link : https://mierdekel.hu/ Új külsö link : https://klajosw.blogspot.hu/ Új belső link : https://klajosw.blogspot.com/