#!/usr/bin/env python
# coding: utf-8

# # Data Scraping:
# > # An introduction to Requests, BeautifulSoup, and XPath
# ***
# 王成军
# wangchengjun@nju.edu.cn
# 计算传播网 http://computational-communication.com

# In[12]:

get_ipython().run_line_magic('pinfo', 'display_html')

# In[11]:

# basic principles of web crawling
from IPython.display import display_html, HTML
HTML(url="http://www.cnblogs.com/zhaof/p/6898138.html")

# # Problems we need to solve
#
# - parsing pages
# - getting source data hidden behind JavaScript
# - automatic pagination
# - automatic login
# - connecting to API endpoints
#
# - For ordinary scraping tasks, requests combined with BeautifulSoup is enough.
# - This is especially true for sites whose URL changes in a regular pattern as you page through results: you only need to handle the patterned URLs.
# - A simple example is scraping posts about a given keyword from the Tianya forum (天涯论坛).
# - On Tianya, the first page of posts about smog is:
# http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=雾霾
# - and the second page is:
# http://bbs.tianya.cn/list.jsp?item=free&nextid=1&order=8&k=雾霾

# # The first crawler
#
# Beautifulsoup Quick Start
#
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/
#
# ![](./img/bs.jpg)
#
# http://computational-class.github.io/bigdata/data/test.html

# In[3]:

import requests
from bs4 import BeautifulSoup

# In[53]:

help(requests.get)

# In[5]:

url = 'http://computational-class.github.io/bigdata/data/test.html'
content = requests.get(url)
help(content)

# In[6]:

print(content.text)

# In[7]:

content.encoding

# # Beautiful Soup
#
# > Beautiful Soup is a Python library designed for quick turnaround projects like screen-scraping. Three features make it powerful:
#
# - Beautiful Soup provides a few simple methods. It doesn't take much code to write an application.
# - Beautiful Soup automatically converts incoming documents to Unicode and outgoing documents to UTF-8. You just have to specify the original encoding.
# - Beautiful Soup sits on top of popular Python parsers like `lxml` and `html5lib`.

# # Install beautifulsoup4
#
# ### open your terminal/cmd
#
# $ pip install beautifulsoup4

# # html.parser
# Beautiful Soup supports the html.parser included in Python's standard library.

# # lxml
# It also supports a number of third-party Python parsers. One is the `lxml` parser. Depending on your setup, you might install lxml with one of these commands:
#
# > $ apt-get install python-lxml
#
# > $ easy_install lxml
#
# > $ pip install lxml

# # html5lib
# Another alternative is the pure-Python `html5lib` parser, which parses HTML the way a web browser does.
# Depending on your setup, you might install html5lib with one of these commands:
#
# > $ apt-get install python-html5lib
#
# > $ easy_install html5lib
#
# > $ pip install html5lib
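# As a quick illustration of why the parser choice matters, here is a minimal sketch
# (assuming `lxml` and `html5lib` have been installed with the commands above):
# the three parsers can build different trees from the same malformed HTML.

# In[ ]:

from bs4 import BeautifulSoup

broken_html = '<p>First<p>Second'  # two unclosed <p> tags
for parser in ['html.parser', 'lxml', 'html5lib']:
    try:
        print(parser, '->', BeautifulSoup(broken_html, parser))
    except Exception as err:  # the parser is not installed in this environment
        print(parser, 'not available:', err)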
# In[9]:

url = 'http://computational-class.github.io/bigdata/data/test.html'
content = requests.get(url)
content = content.text
soup = BeautifulSoup(content, 'html.parser')
soup

# In[10]:

print(soup.prettify())

# - html
#     - head
#         - title
#     - body
#         - p (class = 'title', 'story')
#             - a (class = 'sister')
#                 - href/id

# # The select method
#
# - a tag name is used without any prefix
# - a class name is prefixed with a dot `.`
# - an id name is prefixed with a hash `#`
#
# We can use these rules with the soup.select() method to filter elements; it returns a list.

# ## Three steps for using select
#
# - Inspect
# - Copy
#     - Copy selector
#
# - Select the title `The Dormouse's story` with the mouse, right-click, and choose Inspect
# - Move the mouse over the highlighted source code
# - Right-click, then Copy --> Copy selector
#
# `body > p.title > b`

# In[14]:

soup.select('body > p.title > b')#[0].text

# ### The select method: lookup by tag name

# In[68]:

soup.select('title')

# In[65]:

soup.select('a')

# In[66]:

soup.select('b')

# ### The select method: lookup by class name

# In[69]:

soup.select('.title')

# In[26]:

soup.select('.sister')

# In[27]:

soup.select('.story')

# ### The select method: lookup by id

# In[15]:

soup.select('#link1')

# In[16]:

soup.select('#link1')[0]['href']

# ### The select method: combined lookup
#
# Tag names, class names, and id names can be combined.
#
# - For example, find the element with id `link1` inside `p` tags:

# In[70]:

soup.select('p #link1')

# ### The select method: attribute and child lookup
#
# - Attribute values are written in square brackets, e.g. `a[href]`; the attribute and its tag belong to the same node, so no space is allowed between them.
# - A direct child is selected with `>`:

# In[17]:

soup.select("head > title")

# In[72]:

soup.select("body > p")

# # The find_all method

# In[30]:

soup('p')

# In[31]:

soup.find_all('p')

# In[32]:

[i.text for i in soup('p')]

# In[34]:

for i in soup('p'):
    print(i.text)

# In[35]:

for tag in soup.find_all(True):
    print(tag.name)

# In[36]:

soup('head')  # or soup.head

# In[37]:

soup('body')  # or soup.body

# In[38]:

soup('title')  # or soup.title

# In[39]:

soup('p')

# In[40]:

soup.p

# In[41]:

soup.title.name

# In[42]:

soup.title.string

# In[43]:

soup.title.text
# .text is the recommended way to get the text

# In[44]:

soup.title.parent.name

# In[45]:

soup.p

# In[46]:

soup.p['class']

# In[47]:

soup.find_all('p', {'class': 'title'})

# In[19]:

soup.find_all('p', class_= 'title')

# In[49]:

soup.find_all('p', {'class': 'story'})

# In[34]:

soup.find_all('p', {'class': 'story'})[0].find_all('a')

# In[51]:

soup.a

# In[52]:

soup('a')

# In[53]:

soup.find(id="link3")

# In[54]:

soup.find_all('a')

# In[55]:

soup.find_all('a', {'class': 'sister'})  # compare with soup.find_all('a')

# In[56]:

soup.find_all('a', {'class': 'sister'})[0]

# In[57]:

soup.find_all('a', {'class': 'sister'})[0].text

# In[58]:

soup.find_all('a', {'class': 'sister'})[0]['href']

# In[59]:

soup.find_all('a', {'class': 'sister'})[0]['id']

# In[71]:

soup.find_all(["a", "b"])

# In[38]:

print(soup.get_text())

# ***
# ***
#
# # Data Scraping:
# > # Scraping WeChat public account articles
# ***
# ***
#
# 王成军
# wangchengjun@nju.edu.cn
# 计算传播网 http://computational-communication.com

# In[16]:

from IPython.display import display_html, HTML
HTML(url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd')
# the webpage we would like to crawl

# # View the page source with Inspect

# In[36]:

url = "http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd"
content = requests.get(url).text  # fetch the html text of the page
soup = BeautifulSoup(content, 'html.parser')

# In[37]:

title = soup.select("#activity-name")
title[0].text.strip()

# In[40]:

soup.find('h2', {'class': 'rich_media_title'}).text.strip()

# In[185]:

print(soup.find('div', {'class': 'rich_media_meta_list'}))

# In[42]:

soup.find('em').text

# In[43]:

article = soup.find('div', {'class': 'rich_media_content'}).text
print(article)
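# A minimal sketch (the function name is ours) that wraps the title and body extraction
# shown above into a single reusable function, so other article URLs can be handled the same way.

# In[ ]:

import requests
from bs4 import BeautifulSoup

def fetch_wechat_article(article_url):
    """Return (title, body_text) of a WeChat article, using the selectors shown above."""
    html = requests.get(article_url).text
    page = BeautifulSoup(html, 'html.parser')
    title_tag = page.find('h2', {'class': 'rich_media_title'})
    body_tag = page.find('div', {'class': 'rich_media_content'})
    title_text = title_tag.text.strip() if title_tag else ''
    body_text = body_tag.get_text() if body_tag else ''
    return title_text, body_text

# fetch_wechat_article(url)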
# In[44]:

rmml = soup.find('div', {'class': 'rich_media_meta_list'})
date = rmml.find(id = 'post-date').text
rmc = soup.find('div', {'class': 'rich_media_content'})
content = rmc.get_text()
print(title[0].text.strip())
print(date)
print(content)

# # requests + XPath: the Douban movie page as an example
#
# XPath (XML Path Language) is a language for addressing parts of an XML document.
#
# XPath is built on the tree structure of XML and provides the ability to locate nodes in that tree. It was originally proposed as a general syntax model sitting between XPointer and XSL, but developers quickly adopted it as a small query language.
#
# Getting an element's XPath and extracting its text. The XPath of an element has to be obtained by hand:
# - locate the target element
# - on the page, right-click > Inspect
# - Copy XPath
# - append '/text()' to the XPath
#
# Reference: https://mp.weixin.qq.com/s/zx3_eflBCrrfOqFEWjAUJw

# In[46]:

import requests
from lxml import etree

url = 'https://movie.douban.com/subject/26611804/'
data = requests.get(url).text
s = etree.HTML(data)

# If the XPath of the Douban movie title is xpath_info, then the title is obtained as
#
# `title = s.xpath('xpath_info/text()')`
#
# where xpath_info is:
#
# `//*[@id="content"]/h1/span[1]`

# In[47]:

title = s.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
director = s.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
actors = s.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
type1 = s.xpath('//*[@id="info"]/span[5]/text()')
type2 = s.xpath('//*[@id="info"]/span[6]/text()')
type3 = s.xpath('//*[@id="info"]/span[7]/text()')
time = s.xpath('//*[@id="info"]/span[11]/text()')
length = s.xpath('//*[@id="info"]/span[13]/text()')
score = s.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]

# In[48]:

print(title, director, actors, type1, type2, type3, time, length, score)

# ## Douban API
#
# https://developers.douban.com/wiki/?title=guide

# In[16]:

get_ipython().system('pip install fake-useragent')

# In[20]:

from fake_useragent import UserAgent
import requests

ua = UserAgent(cache=True, use_cache_server=True)

# In[21]:

# request headers
#headers={"User-Agent":ua.chrome}
headers = {'User-Agent': ua.chrome}
# request URL
url = 'https://api.douban.com/v2/user/1000001'
response = requests.get(url=url, headers=headers)

# In[92]:

import requests
url = 'https://api.douban.com/v2/movie/26611804'
#url = 'https://api.douban.com/v2/user/1000001/'
jsonm = requests.get(url).json()

# In[93]:

jsonm

# In[89]:

#jsonm.values()
jsonm.keys(), jsonm['rating']

# In[84]:

jsonm['alt']

# In[87]:

jsonm['attrs']['director']

# In[89]:

jsonm['attrs']['movie_type']

# In[88]:

jsonm['attrs']['cast']
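# A minimal sketch (the function name and the field selection are ours, and we assume the
# `rating` object of the Douban v2 movie response carries an `average` field): collect the
# fields explored above into one flat record, which is handy when many movies are scraped.

# In[ ]:

def movie_record(movie_json):
    """Flatten the parts of the Douban movie JSON used above into a single dict."""
    attrs = movie_json.get('attrs', {})
    return {
        'url': movie_json.get('alt'),
        'rating': movie_json.get('rating', {}).get('average'),  # assumed field name
        'directors': attrs.get('director'),
        'cast': attrs.get('cast'),
        'movie_type': attrs.get('movie_type'),
    }

# movie_record(jsonm)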
# In[129]:

headers = {
    'Host': 'api.douban.com',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    'Cookie': 'gr_user_id=54559934-955b-4798-9df1-ed12a97b61b1; ue="wangchj04@126.com"; _ga=GA1.2.1584253277.1448983887; _vwo_uuid_v2=7CD7A27EE46C68D5713E8870DCBB0C50|a39513af0c4457f727aeb9dcb79c7867; douban-profile-remind=1; douban-fav-remind=1; bid=-n0SJDzOCOU; ll="118159"; __gads=ID=223032a1f45c3c9d:T=1541658183:S=ALNI_MY65rcbNHf8eTpIzbr9MTNv1lhuSg; push_doumail_num=0; UM_distinctid=167a02c6a9b273-064d1e75f342f3-35677603-fa000-167a02c6a9c203; __utmv=30149280.155; ct=y; push_noty_num=0; __utmc=30149280; __utmz=30149280.1548213414.56.6.utmcsr=book.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/2893874/comments/; viewed="1536615"; dbcl2="1558440:4omV9m7YBqg"; ck=AdQI; ap_v=0,6.0; __utma=30149280.1584253277.1448983887.1548653281.1548735278.59; __utmt=1; __utmb=30149280.17.5.1548735287585'
}

# parse the raw Cookie header string into a dict
cookies = {}
raw_cookies = headers['Cookie']
for line in raw_cookies.split(';'):
    key, value = line.strip().split('=', 1)  # split only once, yielding two parts
    cookies[key] = value
cookies

# In[135]:

import requests
url = 'https://api.douban.com/v2/user/1000001/'
jsonm = requests.get(url, cookies = cookies)#.json()

# In[136]:

jsonm

# In[134]:

jsonm.request.headers

# # Using requests.post to log in to Douban (including fetching the CAPTCHA)
# https://blog.csdn.net/zhuzuwei/article/details/80875538

# ## Homework: scrape the Douban Top 250 movies

# In[59]:

import requests
from bs4 import BeautifulSoup
from lxml import etree

url0 = 'https://movie.douban.com/top250?start=0&filter='
data = requests.get(url0).text
s = etree.HTML(data)

# In[222]:

s.xpath('//*[@id="content"]/div/div[1]/ol/li[1]/div/div[2]/div[1]/a/span[1]/text()')[0]

# In[225]:

s.xpath('//*[@id="content"]/div/div[1]/ol/li[2]/div/div[2]/div[1]/a/span[1]/text()')[0]

# In[227]:

s.xpath('//*[@id="content"]/div/div[1]/ol/li[3]/div/div[2]/div[1]/a/span[1]/text()')[0]

# In[60]:

import requests
from bs4 import BeautifulSoup

url0 = 'https://movie.douban.com/top250?start=0&filter='
data = requests.get(url0).text
soup = BeautifulSoup(data, 'lxml')

# In[61]:

movies = soup.find_all('div', {'class': 'info'})

# In[62]:

len(movies)

# In[63]:

movies[0].a['href']

# In[39]:

movies[0].find('span', {'class': 'title'}).text

# In[26]:

movies[0].find('div', {'class': 'star'})

# In[28]:

movies[0].find('span', {'class': 'rating_num'}).text

# In[90]:

people_num = movies[0].find('div', {'class': 'star'}).find_all('span')[-1]
people_num.text.split('人评价')[0]

# In[64]:

for i in movies:
    url = i.a['href']
    title = i.find('span', {'class': 'title'}).text
    des = i.find('div', {'class': 'star'})
    rating = des.find('span', {'class': 'rating_num'}).text
    rating_num = des.find_all('span')[-1].text.split('人评价')[0]
    print(url, title, rating, rating_num)

# In[51]:

for i in range(0, 250, 25):
    print('https://movie.douban.com/top250?start=%d&filter='% i)

# In[65]:

import requests
from bs4 import BeautifulSoup

dat = []
for j in range(0, 250, 25):
    urli = 'https://movie.douban.com/top250?start=%d&filter='% j
    data = requests.get(urli).text
    soup = BeautifulSoup(data, 'lxml')
    movies = soup.find_all('div', {'class': 'info'})
    for i in movies:
        url = i.a['href']
        title = i.find('span', {'class': 'title'}).text
        des = i.find('div', {'class': 'star'})
        rating = des.find('span', {'class': 'rating_num'}).text
        rating_num = des.find_all('span')[-1].text.split('人评价')[0]
        listi = [url, title, rating, rating_num]
        dat.append(listi)

# In[66]:

import pandas as pd

df = pd.DataFrame(dat, columns = ['url', 'title', 'rating', 'rating_num'])
df['rating'] = df.rating.astype(float)
df['rating_num'] = df.rating_num.astype(int)
df.head()

# In[3]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.hist(df.rating_num)
plt.show()

# In[19]:

plt.hist(df.rating)
plt.show()

# In[11]:

fig = plt.figure(figsize=(16, 16), facecolor='white')
plt.plot(df.rating_num, df.rating, 'bo')
for i in df.index:
    plt.text(df.rating_num[i], df.rating[i], df.title[i],
             fontsize = df.rating[i],
             color = 'red', rotation = 45)
plt.show()

# In[123]:

df[df.rating > 9.4]

# In[69]:

alist = []
for i in df.index:
    alist.append([df.rating_num[i], df.rating[i], df.title[i]])

blist = [[df.rating_num[i], df.rating[i], df.title[i]] for i in df.index]

alist

# In[70]:

from IPython.display import display_html, HTML
HTML('')
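# The loop above fires ten requests at Douban in quick succession. A minimal sketch
# (the delay value and the simplified User-Agent string are our own choices, not part of
# the original assignment) of a more polite version that pauses between pages and handles
# network errors:

# In[ ]:

import time
import requests
from bs4 import BeautifulSoup

polite_headers = {'User-Agent': 'Mozilla/5.0'}  # assumed, simplified UA string
dat_polite = []
for start in range(0, 250, 25):
    page_url = 'https://movie.douban.com/top250?start=%d&filter=' % start
    try:
        html = requests.get(page_url, headers=polite_headers, timeout=10).text
    except requests.RequestException as err:  # network error: skip this page
        print('failed on', page_url, err)
        continue
    page_soup = BeautifulSoup(html, 'lxml')
    for info in page_soup.find_all('div', {'class': 'info'}):
        dat_polite.append([info.a['href'],
                           info.find('span', {'class': 'title'}).text])
    time.sleep(1)  # wait one second between pages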
# # Homework:
#
# - Scrape the content of the latest issue of the Fudan New Media (复旦新媒体) WeChat public account.

# # Scraping ten years of proposals of the Jiangsu Provincial CPPCC (江苏省政协)

# In[82]:

# headers = {
#     'Accept': 'application/json, text/javascript, */*; q=0.01',
#     'Accept-Encoding': 'gzip, deflate',
#     'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
#     'Cache-Control': 'no-cache',
#     'Connection': 'keep-alive',
#     'Cookie': 'JSESSIONID=992CB756ADE61B87409672DC808FDD92',
#     'DNT': '1',
#     'Host': 'www.jszx.gov.cn',
#     'Pragma': 'no-cache',
#     'Referer': 'http://www.jszx.gov.cn/zxta/2019ta/',
#     'Upgrade-Insecure-Requests': '1',
#     'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
# }

# Open http://www.jszx.gov.cn/zxta/2019ta/
#
# - Click "next page": the URL does not change!
#
# > So the data are pushed to the page with JavaScript.
#
# - Inspect the Network panel and find proposalList.jsp
# - Look at its request headers and find the form_data
#
# http://www.jszx.gov.cn/zxta/2019ta/

# In[4]:

import requests
from bs4 import BeautifulSoup

# In[5]:

form_data = {'year': 2019,
             'pagenum': 1,
             'pagesize': 20
            }
url = 'http://www.jszx.gov.cn/wcm/zxweb/proposalList.jsp'
content = requests.get(url, params=form_data)
content.encoding = 'utf-8'
js = content.json()

# In[6]:

js['data']['totalcount']

# In[7]:

dat = js['data']['list']
pagenum = js['data']['pagecount']

# ### Scrape the links of all proposals

# In[147]:

for i in range(2, pagenum + 1):
    print(i)
    form_data['pagenum'] = i
    content = requests.get(url, params=form_data)
    content.encoding = 'utf-8'
    js = content.json()
    for j in js['data']['list']:
        dat.append(j)

# In[149]:

len(dat)

# In[150]:

dat[0]

# In[155]:

import pandas as pd

df = pd.DataFrame(dat)
df.head()

# In[158]:

df.groupby('type').size()

# ### Scrape the content of each proposal
#
# http://www.jszx.gov.cn/zxta/2019ta/index_61.html?pkid=18b1b347f9e34badb8934c2acec80e9e
#
# http://www.jszx.gov.cn/wcm/zxweb/proposalInfo.jsp?pkid=18b1b347f9e34badb8934c2acec80e9e

# In[163]:

url_base = 'http://www.jszx.gov.cn/wcm/zxweb/proposalInfo.jsp?pkid='
urls = [url_base + i for i in df['pkid']]

# In[176]:

import sys

def flushPrint(www):
    # overwrite the same line to show progress
    sys.stdout.write('\r')
    sys.stdout.write('%s' % www)
    sys.stdout.flush()

text = []
for k, i in enumerate(urls):
    flushPrint(k)
    content = requests.get(i)
    content.encoding = 'utf-8'
    js = content.json()
    js = js['data']['binfo']['_content']
    soup = BeautifulSoup(js, 'html.parser')
    text.append(soup.text)

# In[177]:

len(text)

# In[178]:

df['content'] = text

# In[179]:

df.head()

# In[181]:

df.to_csv('../data/jszx2019.csv', index = False)

# In[182]:

dd = pd.read_csv('../data/jszx2019.csv')
dd.head()

# In[ ]:
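# A minimal sketch (the function name is ours) pulling the listing steps above together:
# page through proposalList.jsp for a given year and return the proposal list as a
# DataFrame, ready for the content-fetching step.

# In[ ]:

import pandas as pd
import requests

def crawl_proposal_list(year, pagesize=20):
    """Return a DataFrame of all proposals of `year` from the Jiangsu CPPCC site."""
    list_url = 'http://www.jszx.gov.cn/wcm/zxweb/proposalList.jsp'
    payload = {'year': year, 'pagenum': 1, 'pagesize': pagesize}
    first = requests.get(list_url, params=payload)
    first.encoding = 'utf-8'
    data = first.json()['data']
    records = data['list']
    for page in range(2, int(data['pagecount']) + 1):  # remaining pages
        payload['pagenum'] = page
        resp = requests.get(list_url, params=payload)
        resp.encoding = 'utf-8'
        records.extend(resp.json()['data']['list'])
    return pd.DataFrame(records)

# df2019 = crawl_proposal_list(2019)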