import BeautifulSoup as bs
from IPython.display import HTML
import urllib2
import re
# Download the revision history of the pt.wikipedia "Esplanada/geral" page,
# sending a browser-like User-Agent header with the request.
url = 'http://pt.wikipedia.org/w/index.php?title=Wikip%C3%A9dia:Esplanada/geral&action=history'
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url, None, headers)
html = urllib2.urlopen(req).read()
soup = bs.BeautifulSoup(html)

# Locate the history entries carrying the "(novo tópico" marker text.
topics = soup.findAll('li', text=re.compile(u'\(novo tópico:*'))

# Build one record per match.  Each value is the parsed element itself (not
# plain text), so its 'href' stays available to the code further down.
topics_l = []
for topic in topics:
    t = topic.findParent()      # element directly enclosing the match
    row = t.findParent()        # one level further up; holds user and date
    topics_l.append({
        # Second link inside the enclosing element.
        # NOTE(review): presumably the link to the new section - confirm.
        'title': t.findAll('a')[1],
        'author': row.find('span', attrs={'class': 'history-user'}).a,
        'date': row.find('a', attrs={'class': 'mw-changeslist-date'}),
    })
def html_new_topics(topics):
    """Render the new-topic records as an HTML heading plus a bullet list.

    Args:
        topics: list of dicts with the keys 'title', 'author' and 'date'.
            Every value must support ``value['href']`` (read and write) and
            ``str(value)``.

    Returns:
        A string of HTML: a heading with the number of topics, followed by
        one list item per topic in the form "title - author - date".

    Side effects:
        The 'href' of every value is rewritten in place to an absolute
        pt.wikipedia.org URL.  Later code reads ``topic['title']['href']``
        and fetches it directly, so this mutation is kept on purpose.  It is
        idempotent: an href that is already absolute is left untouched, so
        calling the function twice no longer doubles the host prefix.

    NOTE(review): the string literals in this function had lost their markup
    (they were split across lines and no longer valid Python).  The heading
    and list tags below are a reconstruction - confirm the exact tags.
    """
    base_url = 'http://pt.wikipedia.org'
    items = []
    for topic in topics:
        for link in topic.values():
            href = str(link['href'])
            # Only prefix relative links; see "Side effects" above.
            if not href.startswith(('http://', 'https://')):
                link['href'] = base_url + href
        items.append(
            '<li>'
            + str(topic['title']) + ' - '
            + str(topic['author']) + ' - '
            + str(topic['date'])
            + '</li>'
        )
    heading = '<h2>{} novos tópicos</h2>'.format(len(topics))
    return heading + '<ul>' + ''.join(items) + '</ul>'
# Build the summary list of new topics as an IPython HTML object.
HTML(html_new_topics(topics_l))

# Fetch the page of the first new topic.  html_new_topics() above rewrote
# every href in place to an absolute URL, so the href can be requested as is.
url = topics_l[0]['title']['href']
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url, None, headers)
html = urllib2.urlopen(req).read()
soup = bs.BeautifulSoup(html)

# The content container must be looked up *before* it is inspected; the
# original order used cont_div one line before assigning it (NameError).
cont_div = soup.find('div', attrs={'id': 'mw-content-text'})
len(cont_div.findAll(text=True))

# Print the direct child elements of the container up to (not including) the
# first <dl>, skipping tables and any non-element nodes.
for i in cont_div:
    if not isinstance(i, bs.Tag):
        continue
    if i.name == 'dl':
        break
    if i.name != 'table':
        # Single-argument print(): same output on Python 2 and Python 3.
        print(i)
def _fetch_topic_content(url):
    """Download a topic page and return the markup of its opening part.

    The direct child elements of the ``mw-content-text`` div are collected in
    order, skipping tables and non-element nodes, and stopping at the first
    ``<dl>``.  Returns an empty string when the page has no such div.
    """
    url = str(url)
    # Accept site-relative links too, so this does not depend on the hrefs
    # having been made absolute beforehand.
    if not url.startswith(('http://', 'https://')):
        url = 'http://pt.wikipedia.org' + url
    req = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(req)
    try:
        page = response.read()
    finally:
        response.close()

    cont_div = bs.BeautifulSoup(page).find('div', attrs={'id': 'mw-content-text'})
    if cont_div is None:
        return ''

    parts = []
    for node in cont_div:
        if not isinstance(node, bs.Tag):
            continue
        if node.name == 'dl':
            break
        if node.name != 'table':
            parts.append(str(node))
    return ''.join(parts)


def html_new_topics(topics, content=False):
    """Render the new-topic records as an HTML heading plus a bullet list.

    Args:
        topics: list of dicts with the keys 'title', 'author' and 'date'.
            The values are formatted with ``str.format``; when ``content`` is
            true, ``topic['title']['href']`` must be the URL of the topic.
        content: when true, each topic page is downloaded (one HTTP request
            per topic) and its opening content is embedded in the list item.

    Returns:
        A string of HTML: a heading with the number of topics, followed by
        one list item per topic in the form "title - author - date",
        optionally followed by the fetched content.

    NOTE(review): the string literals in this function had lost their markup
    (they were split across lines and no longer valid Python).  The heading,
    list and line-break tags below are a reconstruction - confirm the exact
    tags.
    """
    items = []
    for topic in topics:
        item = '<li>' + '{} - {} - {}'.format(
            topic['title'], topic['author'], topic['date'])
        if content:
            item += '<br/>' + _fetch_topic_content(topic['title']['href'])
        items.append(item + '</li>')
    heading = '<h2>{} novos tópicos</h2>'.format(len(topics))
    return heading + '<ul>' + ''.join(items) + '</ul>'
# Render the list again, this time with content=True so that every topic page
# is downloaded and its extracted content is embedded under the entry.
HTML(html_new_topics(topics_l, content=True))