We'll start with a very simple chunk of XML, and then move to a more realistic example.
using LibExpat
names(LibExpat)
14-element Array{Symbol,1}: :LibExpat :XPStreamHandler :free :xpath :pause :ETree symbol("@xpath_str") :ParsedData :stop :resume :parse :XPCallbacks :parsefile :xp_parse
sm = """<blah id="42" class="top">hi
<blue id="1" class="cold">hey</blue>
<red id="2" class="hot">yo</red>
</blah>"""
"<blah id=\"42\" class=\"top\">hi\n <blue id=\"1\" class=\"cold\">hey</blue>\n <red id=\"2\" class=\"hot\">yo</red>\n</blah>"
# NOTE: `s` is the HTML snippet assigned in the scraping example further down;
# that assignment has to be evaluated before this line can run.
et=xp_parse(s);
The LibExpat.jl README describes the format of the `element_path` argument, i.e. the path string passed to `LibExpat.find`.
Let's check the structure of a simple ETree
esm = xp_parse(sm)
dump(esm)
ETree name: ASCIIString "blah" attr: Dict{String,String} len 2 class: ASCIIString "top" id: ASCIIString "42" elements: Array(Union(String,ETree),(8,)) ["hi","\n"," ",<blue class="cold" id="1">hey</blue>,"\n"," ",<red class="hot" id="2">yo</red>,"\n"] parent: ETree name: ASCIIString "" attr: Dict{String,String} len 0 elements: Array(Union(String,ETree),(1,)) [<blah class="top" id="42">hi <blue class="cold" id="1">hey</blue> <red class="hot" id="2">yo</red> </blah>] parent: ETree name: ASCIIString "" attr: Dict{String,String} len 0 elements: Array(Union(String,ETree),(1,)) [<blah class="top" id="42">hi <blue class="cold" id="1">hey</blue> <red class="hot" id="2">yo</red> </blah>] parent: ETree name: ASCIIString "" attr: Dict{String,String} len 0 elements: Array(Union(String,ETree),(1,)) [<blah class="top" id="42">hi <blue class="cold" id="1">hey</blue> <red class="hot" id="2">yo</red> </blah>] parent: ETree name: ASCIIString "" attr: Dict{String,String} len 0 elements: Array(Union(String,ETree),(1,)) [<blah class="top" id="42">hi <blue class="cold" id="1">hey</blue> <red class="hot" id="2">yo</red> </blah>] parent: ETree
esm.name, esm.attr
("blah",["class"=>"top","id"=>"42"])
esm.elements
8-element Array{Union(String,ETree),1}: "hi" "\n" " " <blue class="cold" id="1">hey</blue> "\n" " " <red class="hot" id="2">yo</red> "\n"
typeof(esm.elements[1]) <: String
true
# Walk the children of `esm`, ignoring entries that are only whitespace.
for e in esm.elements
    text = strip(string(e))
    isempty(text) && continue
    # Print each remaining child next to its concrete type.
    println(text, " ", typeof(e))
    # Children that are strings (rather than nested ETree nodes) are echoed
    # a second time as the payload.
    isa(e, String) && println("Payload: ", text)
end
hi ASCIIString Payload: hi <blue class="cold" id="1">hey</blue> ETree <red class="hot" id="2">yo</red> ETree
Here we are scraping data from a chunk of fairly clean HTML.
s="""<div id="flight_container" style="padding: 2px;">
<table class="table_sides" width="100%" cellpadding="0" cellspacing="0" border="0" align=""><tbody><tr>
<td bgcolor="FFFFFF">
<table width="100%" border="0" cellpadding="4" cellspacing="0" class=""><thead>
<tr><td colspan="15" class="table_header" align="left">Flight Info - NXXXXXX(Rogers Bleeblah #) </td></tr>
<tr>
<td width="" class="table_row_header" align="left" valign="middle">Date</td>
<td width="" class="table_row_header" align="left" valign="middle">Origin</td>
<td width="" class="table_row_header" align="left" valign="middle">Dest</td>
<td width="" class="table_row_header" align="left" valign="middle">Depart</td>
<td width="" class="table_row_header" align="left" valign="middle">Arrive</td>
<td width="" class="table_row_header" align="left" valign="middle">Hobbs</td>
<td width="" class="table_row_header" align="left" valign="middle">Flight Time</td>
<td width="" class="table_row_header" align="left" valign="middle">Ground Time</td>
<td width="" class="table_row_header" align="left" valign="middle">Flight Distance</td>
<td width="" class="table_row_header" align="left" valign="middle">Taxi Distance</td>
<td width="" class="table_row_header" align="left" valign="middle">Fuel</td>
<td width="" class="table_row_header" align="left" valign="middle">Fuel/hr</td>
<td width="" class="table_row_header" align="left" valign="middle">Fuel/nm</td>
<td width="" class="table_row_header" align="left" valign="middle">Altitude</td>
<td width="" class="table_row_header" align="left" valign="middle">Gnd Speed</td>
</tr></thead><tbody>
<tr class="table_row1" onmouseover="style.backgroundColor='#FFF9C4'" onmouseout="style.backgroundColor='#FFFFFF'">
<td width="" class="table_td" align="left" valign="top">Mon, May xx, 2010</td>
<td width="" class="table_td" align="left" valign="top">KMYF</td>
<td width="" class="table_td" align="left" valign="top">XXXX</td>
<td width="" class="table_td" align="left" valign="top">10:44</td>
<td width="" class="table_td" align="left" valign="top">12:43</td>
<td width="" class="table_td" align="left" valign="top">1.92 hrs</td>
<td width="" class="table_td" align="left" valign="top">1.8 hrs (1:48)</td>
<td width="" class="table_td" align="left" valign="top">0.12 hrs (0:07)</td>
<td width="" class="table_td" align="left" valign="top">177.27 nm</td>
<td width="" class="table_td" align="left" valign="top">1.32 nm</td>
<td width="" class="table_td" align="left" valign="top">16.69 gal</td>
<td width="" class="table_td" align="left" valign="top">8.68 gal/hr</td>
<td width="" class="table_td" align="left" valign="top">0.09 gal/nm</td>
<td width="" class="table_td" align="left" valign="top">9511 msl</td>
<td width="" class="table_td" align="left" valign="top">95.21 kts</td>
</tr>
</tbody></table>
</td></tr></tbody></table>
</div>
""";
# Parse the HTML held in `s` here, right after `s` has been assigned, so that
# `et` is guaranteed to describe this snippet before it is queried.
et = xp_parse(s)
# Select the <td> elements matched by the path: any <td> below a <table> that
# is itself nested inside the top-level div's table.
tds = LibExpat.find(et, "/div/table//table//td")
31-element Array{ETree,1}: <td class="table_header" align="left" colspan="15">Flight Info - NXXXXXX(Rogers Bleeblah #) </td> <td class="table_row_header" valign="middle" align="left" width="">Date</td> <td class="table_row_header" valign="middle" align="left" width="">Origin</td> <td class="table_row_header" valign="middle" align="left" width="">Dest</td> <td class="table_row_header" valign="middle" align="left" width="">Depart</td> <td class="table_row_header" valign="middle" align="left" width="">Arrive</td> <td class="table_row_header" valign="middle" align="left" width="">Hobbs</td> <td class="table_row_header" valign="middle" align="left" width="">Flight Time</td> <td class="table_row_header" valign="middle" align="left" width="">Ground Time</td> <td class="table_row_header" valign="middle" align="left" width="">Flight Distance</td> <td class="table_row_header" valign="middle" align="left" width="">Taxi Distance</td> <td class="table_row_header" valign="middle" align="left" width="">Fuel</td> <td class="table_row_header" valign="middle" align="left" width="">Fuel/hr</td> ⋮ <td class="table_td" valign="top" align="left" width="">10:44</td> <td class="table_td" valign="top" align="left" width="">12:43</td> <td class="table_td" valign="top" align="left" width="">1.92 hrs</td> <td class="table_td" valign="top" align="left" width="">1.8 hrs (1:48)</td> <td class="table_td" valign="top" align="left" width="">0.12 hrs (0:07)</td> <td class="table_td" valign="top" align="left" width="">177.27 nm</td> <td class="table_td" valign="top" align="left" width="">1.32 nm</td> <td class="table_td" valign="top" align="left" width="">16.69 gal</td> <td class="table_td" valign="top" align="left" width="">8.68 gal/hr</td> <td class="table_td" valign="top" align="left" width="">0.09 gal/nm</td> <td class="table_td" valign="top" align="left" width="">9511 msl</td> <td class="table_td" valign="top" align="left" width="">95.21 kts</td>
el = tds[1]
<td class="table_header" align="left" colspan="15">Flight Info - NXXXXXX(Rogers Bleeblah #) </td>
typeof(el)
ETree (constructor with 2 methods)
Get the element serialized as a string (the markup is included, not just the inner text):
string(el)
"<td class=\"table_header\" align=\"left\" colspan=\"15\">Flight Info - NXXXXXX(Rogers Bleeblah #) </td>"
Check the attribute Dict to identify elements by class
el.attr["class"]
"table_header"
get(el.attr, "class","")
"table_header"
To extract the flight's aircraft ID (`acid`) and aircraft type (`actype`), parse the header text:
# Split a flight-table title such as
#     "Flight Info - NXXXXXX(Rogers Bleeblah #) "
# into its two parts and return them as the tuple `(acid, actype)`:
# the text before the first '(' and the text after it with "#)" removed.
#
# Only the first '-' and the first '(' act as separators, so hyphens or
# parentheses inside the aircraft type are preserved.
# Throws an ArgumentError when either separator is missing.
function parse_header( hdr )
    # Everything after the first '-' is the payload; rejoin the remaining
    # pieces so a hyphenated type (e.g. "T-41") is not cut short.
    fields = split(hdr, '-')
    length(fields) >= 2 || throw(ArgumentError("no '-' separator in header: $hdr"))
    body = strip(join(fields[2:end], "-"))

    # The first '(' separates the aircraft ID from the aircraft type.
    pieces = split(body, '(')
    length(pieces) >= 2 || throw(ArgumentError("no '(' separator in header: $hdr"))
    acid = strip(pieces[1])

    # Remove every "#)" marker from the type, then trim what is left.
    # split/join is used so the same code runs on old and current Julia.
    rawtype = join(pieces[2:end], "(")
    actype = strip(join(split(rawtype, "#)"), ""))
    return (acid, actype)
end
parse_header (generic function with 1 method)
parse_header( "Flight Info - NXXXXXX (Rogers Bleeblah #) " )
("NXXXXXX","Rogers Bleeblah")
Extract element payloads
# Sort the <td> payloads by CSS class into column labels, row values and the
# table title. Assumes the first child of each <td> is its text — TODO confirm
# for cells that contain nested markup.
labels = ASCIIString[]
values = ASCIIString[]
hdr = ""
# Create these before the loop, exactly like `hdr`: a name that is first
# assigned inside a top-level `for` body is local to the loop and would no
# longer exist once the loop has finished.
acid = ""
actype = ""
for td in tds
    # Look the class up once; a cell has at most one of these classes.
    cls = get(td.attr, "class", "")
    if cls == "table_header"
        hdr = strip(td.elements[1])
        (acid, actype) = parse_header(hdr)
    elseif cls == "table_td"
        push!(values, strip(td.elements[1]))
    elseif cls == "table_row_header"
        push!(labels, strip(td.elements[1]))
    end
end
acid, actype
("NXXXXXX","Rogers Bleeblah")
Load to Dict()
# Build a label => value lookup. A value ending in a digit is stored whole;
# any other value keeps only the text before its first space.
dmap = Dict()
for (i, label) in enumerate(labels)
    raw = values[i]
    # The values were stripped when collected and may therefore be empty;
    # check that first because `raw[end]` throws on an empty string.
    if isempty(raw) || '0' <= raw[end] <= '9'
        dmap[label] = raw
    else
        dmap[label] = split(raw, ' ')[1]
    end
end
dump(dmap)
Dict{Any,Any} len 15 Flight Time: ASCIIString "1.8" Fuel/hr: ASCIIString "8.68" Gnd Speed: ASCIIString "95.21" Fuel: ASCIIString "16.69" Fuel/nm: ASCIIString "0.09" Hobbs: ASCIIString "1.92" Flight Distance: ASCIIString "177.27" Date: ASCIIString "Mon, May xx, 2010" Ground Time: ASCIIString "0.12" Taxi Distance: ASCIIString "1.32" Dest: ASCIIString "XXXX" ...