#!/usr/bin/env python # coding: utf-8 # # XPath # # XPath is short for XML Path Language, a query language for selecting nodes in an XML document. It is very useful in web scraping because a parsed HTML document has the same tree structure as an XML document, so the same queries work on HTML. # In[1]: import requests from lxml import html # In[2]: get_ipython().run_cell_magic('HTML', '', '\n \n

Favorite Python Librarires

Numpy
Pandas
requests

\n \n\n') # ## Load HTML Code # Now I'll read the code from cell number 2 and store it in `html_code`. Finally we will parse that into a lxml node object. # In[3]: html_code = In[2] html_code = html_code[42:-2].replace("\\n","\n") print(html_code) doc = html.fromstring(html_code) # # Using xpath to find nodes in a document # # There many methods for fidning a node that you are interested in from a XML or HTML document. The first way is to write the whole path separated by forward slashes `/` # # ## Reading `

` tag # In[4]: title = doc.xpath("/html/body/h1")[0] title # To read the text inside that tag you can use the text variable. # In[5]: title.text # Another way is read the text is to use the `text()` function in xpath. # In[6]: title = doc.xpath("/html/body/h1/text()")[0] title # ## Working with multiple items # # xpath always returns a list. If there are no matches, it will return an empty list. If there is one match it will return a list with one item. # In[7]: item_list = doc.xpath("/html/body/ul/li") item_list # We can use `text()` function with multiple items. # In[8]: doc = html.fromstring(html_code) item_list = doc.xpath("/html/body/ul/li/text()") item_list # ## Tag selector without full path # # you can select any node in your document that matches a node selector without using the full path with a double forward slash `//` # In[9]: doc = html.fromstring(html_code) item_list = doc.xpath("//li/text()") item_list # ## Selecting one result # # You can select one result from a list using `[index]` after your tag selector. Make sure you use it on the tag selector and not a function selector. # # Notice: This is `index` starts from 1. # In[10]: doc = html.fromstring(html_code) item_list = doc.xpath("/html/body/ul/li[1]/text()") item_list # In[11]: get_ipython().run_cell_magic('HTML', '', '\n \n

Favorite Python Librarires

Numpy
Pandas
requests

Favorite JS Librarires

Bootstrap
jQuery
d3.js

\n\n') # In[12]: html_code = In[11] html_code = html_code[42:-2].replace("\\n","\n") print(html_code) doc = html.fromstring(html_code) # # Attributes selector # # In this example we have two `

` tags with different css classes. We can select tags based on css classes as follows: # In[13]: title = doc.xpath("/html/body/h1[@class='text-muted']/text()")[0] title # ## `contains()` function # # I want to select all items in the first list. I could use the full class for selection or I could just use one of the classed only used in the first list with the `contains()` function. # In[14]: item_list = doc.xpath("/html/body/ul[contains(@class,'nav-stacked')]/li/a/text()") item_list # ## Returning attributes # # What if we want to read the `href` attribute of the `` tag to get the link. This is how you do that: # In[15]: item_list = doc.xpath("/html/body/ul[contains(@class,'nav-stacked')]/li/a/@href") item_list # # Real world example # # Read the list of languages with 1M+ articles on http://www.wikipedia.org/ # In[16]: response = requests.get("http://www.wikipedia.org") doc = html.fromstring(response.content, parser=html.HTMLParser(encoding="utf-8")) # In[17]: lang_list = doc.xpath("//div[@class='langlist langlist-large hlist'][1]/ul/li/a/text()") lang_list # In[ ]: