import requests
from bs4 import BeautifulSoup
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/ width=1000 height=500></iframe>')
# the webpage we would like to crawl
# each report link on the page sits inside a <td class="bl"> cell, e.g.:
<td width="274" class="bl">· <a href="./d12qgrdzfbg/201603/t20160318_369509.html" target="_blank" title="2016年政府工作报告">2016年政府工作报告</a></td>
# get the link for each year
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/"
content = requests.get(url)
content.encoding
'ISO-8859-1'
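# requests falls back to ISO-8859-1 when the HTTP headers declare no charset;
# apparent_encoding (a charset guess over the raw body) should reveal the real
# one for this site -- a quick diagnostic, not something the crawl relies on:
content.apparent_encoding
# expected to report a GB-family encoding, consistent with gb18030 below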
# Specify the encoding
content.encoding = 'gb18030'
content = content.text
soup = BeautifulSoup(content, 'html.parser')
# links = soup.find_all('td', {'class': 'bl'})
links = soup.select('.bl a')
links[0]
<a href="./d12qgrdzfbg/201703/t20170317_389845.html" target="_blank" title="2017年政府工作报告">2017年政府工作报告</a>
len(links)
48
links[-1]['href']
'./dishiyijie_10/200908/t20090818_27558.html'
links[0]['href'].split('./')[1]
'd12qgrdzfbg/201703/t20170317_389845.html'
url + links[0]['href'].split('./')[1]
'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201703/t20170317_389845.html'
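# The standard library offers an equivalent join that handles the leading './'
# automatically -- an alternative, not what the rest of the notebook uses:
from urllib.parse import urljoin
urljoin(url, links[0]['href'])
# 'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201703/t20170317_389845.html'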
hyperlinks = [url + i['href'].split('./')[1] for i in links]
hyperlinks[:5]
['http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201703/t20170317_389845.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201503/t20150318_319434.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201403/t20140315_270863.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201402/t20140214_266528.html']
hyperlinks[-5:]
['http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_9/200908/t20090818_27563.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27561.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27560.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27559.html', 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27558.html']
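# A quick sanity check (illustrative): every assembled URL should sit under
# the base url and end in .html
assert all(h.startswith(url) and h.endswith('.html') for h in hyperlinks)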
hyperlinks[10] # the 2007 report is split across several pages
'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
from IPython.display import display_html, HTML
HTML('<iframe src=http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html width=1000 height=500></iframe>')
# the 2007 report is paginated; fetch its first page
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
content = requests.get(url_i)
content.encoding = 'gb18030'
content = content.text
#content = content.text.encode(content.encoding).decode('gb18030')
soup = BeautifulSoup(content, 'html.parser')
#scripts = soup.find_all('script')
#scripts[0]
scripts = soup.select('td script')[0]
scripts
<script>
	var currentPage = 0;//所在页从0开始
	var prevPage = currentPage-1//上一页
	var nextPage = currentPage+1//下一页
	var countPage = 4//共多少页
	//document.write("共"+countPage+"页 ");

	//循环
	var num = 17;
	for(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){
		if(countPage >1){
			if(currentPage==i)
				document.write("【<span style=\"color:#FF0000;\" class=\"hui14_30_h\">"+(i+1)+"</span>】 ");
			else if(i==0)
				document.write("<a href=\"t20090818_27775.html\" class=\"hui14_30_h\">【"+(i+1)+"】</a> ");
			else
				document.write("<a href=\"t20090818_27775"+"_" + i + "."+"html\" class=\"hui14_30_h\">【"+(i+1)+"】</a> ");
		}
	}

	document.write("<br><br>");
	//设置上一页代码
	if(countPage>1&&currentPage!=0&&currentPage!=1)
		document.write("<a href=\"t20090818_27775"+"_" + prevPage + "."+"html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a> ");
	else if(countPage>1&&currentPage!=0&&currentPage==1)
		document.write("<a href=\"t20090818_27775.html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a> ");
	//else
	//	document.write("上一页 ");

	//设置下一页代码
	if(countPage>1&&currentPage!=(countPage-1))
		document.write("<a href=\"t20090818_27775"+"_" + nextPage + "."+"html\" ><span style=\"color:#0033FF;font-weight:bold\">下一页</span></a> ");
	//else
	//	document.write("下一页 ");
</script>
scripts.text
'\n\tvar currentPage = 0;//所在页从0开始\n\tvar prevPage = currentPage-1//上一页\n\tvar nextPage = currentPage+1//下一页\n\tvar countPage = 4//共多少页\n\t//document.write("共"+countPage+"页 ");\n\t\n\t//循环\n\tvar num = 17;\n\tfor(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){\n\t\tif(countPage >1){\n\t\t\tif(currentPage==i)\n\t\t\t\tdocument.write("【<span style=\\"color:#FF0000;\\" class=\\"hui14_30_h\\">"+(i+1)+"</span>】 ");\n\t\t\telse if(i==0)\n\t\t\t\tdocument.write("<a href=\\"t20090818_27775.html\\" class=\\"hui14_30_h\\">【"+(i+1)+"】</a> ");\n\t\t\telse\n\t\t\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + i + "."+"html\\" class=\\"hui14_30_h\\">【"+(i+1)+"】</a> ");\n\t\t}\t\n\t}\n\t\n\tdocument.write("<br><br>");\n\t//设置上一页代码\n\tif(countPage>1&&currentPage!=0&&currentPage!=1)\n\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + prevPage + "."+"html\\"><span style=\\"color:#0033FF;font-weight:bold\\">上一页</span></a> ");\n\telse if(countPage>1&&currentPage!=0&&currentPage==1)\n\t\tdocument.write("<a href=\\"t20090818_27775.html\\"><span style=\\"color:#0033FF;font-weight:bold\\">上一页</span></a> ");\n\t//else\n\t//\tdocument.write("上一页 ");\n\t\n\t\n\t//设置下一页代码 \n\tif(countPage>1&&currentPage!=(countPage-1))\n\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + nextPage + "."+"html\\" ><span style=\\"color:#0033FF;font-weight:bold\\">下一页</span></a> ");\n\t//else\n\t//\tdocument.write("下一页 ");\n\t\t\t\t\t \n\t'
# countPage = int(''.join(scripts).split('countPage = ')\
# [1].split('//')[0])
# countPage
countPage = int(scripts.text.split('countPage = ')[1].split('//')[0])
countPage
4
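# An equivalent, slightly sturdier extraction via the standard re module,
# instead of chained splits:
import re
int(re.search(r'countPage = (\d+)', scripts.text).group(1))
# 4

# The script also reveals the sub-page naming scheme: page i (for i >= 1)
# lives at t20090818_27775_i.html, which the crawler below relies on:
[url_i.split('.html')[0] + '_' + str(i) + '.html' for i in range(1, countPage)]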
import sys
# progress printer: the carriage return rewrites the current line in place
def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
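# A tiny demonstration (illustrative): each call overwrites the same line,
# so only the last value remains visible.
for i in range(5):
    flushPrint(i)
# 4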
def crawler(url_i):
    content = requests.get(url_i)
    content.encoding = 'gb18030'
    content = content.text
    soup = BeautifulSoup(content, 'html.parser')
    # the title span holds the year, e.g. "2017年政府工作报告"
    year = soup.find('span', {'class': 'huang16c'}).text[:4]
    year = int(year)
    report = ''.join(s.text for s in soup('p'))
    # find the pagination info: countPage in the second <script> tag
    scripts = soup.find_all('script')
    countPage = int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])
    if countPage > 1:
        # fetch the remaining sub-pages, named ..._1.html, ..._2.html, etc.
        for i in range(1, countPage):
            url_child = url_i.split('.html')[0] + '_' + str(i) + '.html'
            content = requests.get(url_child)
            content.encoding = 'gb18030'
            content = content.text
            soup = BeautifulSoup(content, 'html.parser')
            report_child = ''.join(s.text for s in soup('p'))
            report = report + report_child
    return year, report
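# crawler() sends one request per page with no pause or error handling; below
# is a hedged sketch of a politer wrapper (the pause length and retry count
# are illustrative choices, not part of the original notebook):
import time

def polite_crawler(url_i, pause=1, retries=3):
    for attempt in range(retries):
        try:
            time.sleep(pause)  # rate-limit ourselves between requests
            return crawler(url_i)
        except Exception as e:
            print('retry %s for %s: %s' % (attempt + 1, url_i, e))
    raise RuntimeError('failed to crawl ' + url_i)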
# crawl the full text of all 48 government work reports
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    flushPrint(year)
    reports[year] = report
1954
with open('../data/gov_reports1954-2017.txt', 'w', encoding='utf8') as f:
    for r in reports:
        line = str(r) + '\t' + reports[r].replace('\n', '\t') + '\n'
        f.write(line)
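# A quick integrity check (a sketch): one tab-flattened line per report was
# written, so the line count should match len(reports).
with open('../data/gov_reports1954-2017.txt', encoding='utf8') as f:
    print(len(f.readlines()))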
import pandas as pd
df = pd.read_table('../data/gov_reports1954-2017.txt', names = ['year', 'report'])
df[:3]
|   | year | report |
|---|------|--------|
| 0 | 1954 | 1954年政府工作报告——1954年5月23日在中华人民共和国第一届全国人民代表大会第一次会... |
| 1 | 1955 | 1955年国务院政府工作报告关于发展国民经济的第一个五年计划的报告 ——1955年7月5日至... |
| 2 | 1956 | 1956年国务院政府工作报告关于1955年国家决算和1956年国家预算的报告——1956年6... |
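# Row order mirrors the order of the text file; on Python versions where dict
# insertion order is not guaranteed, an explicit sort makes the frame
# chronological -- a sketch using standard pandas:
df = df.sort_values('year').reset_index(drop=True)
df.head(3)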