#!/usr/bin/env python
# coding: utf-8
# # XPath
#
# XPath is short for XML Path Language, a query language for selecting nodes in an XML document. It is very useful in web scraping because an HTML document is parsed into the same kind of node tree as an XML document, so the same queries work on it.
# In[1]:
import requests
from lxml import html
# In[2]:
# Render a small sample page in the notebook: one <h1> heading followed by a
# three-item <ul>. These are the nodes the XPath queries further down select
# (/html/body/h1 and /html/body/ul/li).
# The markup stays on a single line with escaped newlines: a single-quoted
# literal cannot span physical lines, and a later cell recovers the HTML by
# slicing this call's recorded source out of In[2].
get_ipython().run_cell_magic('HTML', '', '\n<html>\n <body>\n  <h1>Favorite Python Librarires</h1>\n  <ul>\n   <li>Numpy</li>\n   <li>Pandas</li>\n   <li>requests</li>\n  </ul>\n </body>\n</html>\n')
# ## Load HTML Code
# Now I'll read the code from cell number 2 and store it in `html_code`. Finally, we will parse that into an lxml node object.
# In[3]:
# In[2] is the source IPython recorded for cell 2. Its first 42 characters are
# the "get_ipython().run_cell_magic('HTML', '', '" wrapper and the last two are
# presumably the closing "')", so the slice keeps only the HTML body. The
# recorded text contains literal backslash-n pairs, which are turned back into
# real newlines before parsing.
html_code = In[2][42:-2].replace("\\n", "\n")
print(html_code)
doc = html.fromstring(html_code)
# # Using xpath to find nodes in a document
#
# There are many ways to find the node you are interested in within an XML or HTML document. The first way is to write the whole path, with each step separated by a forward slash `/`.
#
# ## Reading `<h1>` tag
# In[4]:
# Full path from the document root down to the heading. xpath() hands back a
# list of matches, so keep the first one.
h1_matches = doc.xpath("/html/body/h1")
title = h1_matches[0]
title
# To read the text inside that tag, you can use the element's `text` attribute.
# In[5]:
# Show the text stored on the element selected in the previous cell.
title.text
# Another way to read the text is to use the `text()` function in XPath.
# In[6]:
# Ending the path with text() selects the heading's text node itself, so
# `title` is rebound to the text rather than to the element.
h1_text_matches = doc.xpath("/html/body/h1/text()")
title = h1_text_matches[0]
title
# ## Working with multiple items
#
# xpath always returns a list. If there are no matches, it will return an empty list. If there is one match it will return a list with one item.
# In[7]:
# Every <li> directly under the list; the result is a list of elements.
li_path = "/html/body/ul/li"
item_list = doc.xpath(li_path)
item_list
# We can use `text()` function with multiple items.
# In[8]:
# Parse the page again, then collect the text node of each list item.
doc = html.fromstring(html_code)
li_text_path = "/html/body/ul/li/text()"
item_list = doc.xpath(li_text_path)
item_list
# ## Tag selector without full path
#
# You can select any node in your document that matches a node selector, without writing the full path, by starting the expression with a double forward slash `//`.
# In[9]:
# Parse the page again, then use the leading // to match <li> nodes at any
# depth instead of spelling out the path from the root.
doc = html.fromstring(html_code)
any_li_text_path = "//li/text()"
item_list = doc.xpath(any_li_text_path)
item_list
# ## Selecting one result
#
# You can select one result from a list using `[index]` after your tag selector. Make sure you use it on the tag selector and not a function selector.
#
# **Notice**: This `index` starts from 1, not 0.
# In[10]:
# Parse the page again, then pick only the first <li>. The [1] predicate sits
# on the li step, before text(), and XPath positions are 1-based.
doc = html.fromstring(html_code)
first_li_text_path = "/html/body/ul/li[1]/text()"
item_list = doc.xpath(first_li_text_path)
item_list
# In[11]:
# NOTE(review): the HTML tags (and any attributes) that belong inside this
# string literal appear to have been stripped, leaving only the two heading
# texts, and the single-quoted literal now spans several physical lines, which
# Python cannot parse. TODO: restore the original markup on one line with
# escaped newlines; the tag names and attributes cannot be recovered from here.
get_ipython().run_cell_magic('HTML', '', '\n \n Favorite Python Librarires
\n \n Favorite JS Librarires
\n \n\n')
# In[12]:
# In[11] is the source IPython recorded for cell 11. Its first 42 characters
# are the "get_ipython().run_cell_magic('HTML', '', '" wrapper and the last two
# are presumably the closing "')", so the slice keeps only the HTML body. The
# literal backslash-n pairs are then turned back into real newlines.
html_code = In[11][42:-2].replace("\\n", "\n")
print(html_code)
doc = html.fromstring(html_code)
# # Attributes selector
#
# In this example we have two `