# Parse the sample tagnames.xml shipped with the XML package, asking for
# internal nodes so that XPath queries can be run against the document.
doc <- xmlTreeParse(
  system.file("exampleData", "tagnames.xml", package = "XML"),
  useInternalNodes = TRUE
)
# <b> elements anywhere below the top-level <doc> that have a status attribute
getNodeSet(doc, "/doc//b[@status]")
# ... and only those whose status attribute equals 'foo'
getNodeSet(doc, "/doc//b[@status='foo']")
# Collect the <a> elements with a status attribute and pull that attribute
# out of each one.
els <- getNodeSet(doc, "/doc//a[@status]")
sapply(els, xmlGetAttr, "status")
# Release the parsed document once we are done with it.
free(doc)
# Using a namespace: the prefix "a" used in the XPath expressions is bound to
# the SOAP envelope namespace URI through the third argument of getNodeSet().
f <- system.file("exampleData", "SOAPNamespaces.xml", package = "XML")
# Spell out useInternalNodes in full rather than relying on partial argument
# matching of an abbreviated name.
z <- xmlTreeParse(f, useInternalNodes = TRUE)
# Absolute path: <Body> directly inside the top-level <Envelope>
getNodeSet(
  z, "/a:Envelope/a:Body",
  c(a = "http://schemas.xmlsoap.org/soap/envelope/")
)
# Any <Body> in that namespace, wherever it occurs in the document
getNodeSet(
  z, "//a:Body",
  c(a = "http://schemas.xmlsoap.org/soap/envelope/")
)
free(z)
# Get two items back with namespaces: both steps of the path use the "gmr"
# prefix, which is mapped to the gnumeric namespace URI in the third argument.
f <- system.file("exampleData", "gnumeric.xml", package = "XML")
z <- xmlTreeParse(f, useInternalNodes = TRUE)
getNodeSet(
  z,
  "//gmr:Item/gmr:name",
  c(gmr = "http://www.gnome.org/gnumeric/v2")
)
free(z)
#####
# European Central Bank (ECB) exchange rate data
# Data is available from "http://www.ecb.int/stats/eurofxref/eurofxref-hist.xml"
# or locally.
uri <- system.file("exampleData", "eurofxref-hist.xml.gz", package = "XML")
doc <- xmlTreeParse(uri, useInternalNodes = TRUE)
# The default namespace for all elements is given by
namespaces <- c(ns = "http://www.ecb.int/vocabulary/2002-08-01/eurofxref")
# Get the data for Slovenian currency for all time periods.
# Find all the nodes of the form <Cube currency="SIT"...>
slovenia <- getNodeSet(doc, "//ns:Cube[@currency='SIT']", namespaces)
# Now we have a list of such nodes, loop over them
# and get the rate attribute
rates <- as.numeric(sapply(slovenia, xmlGetAttr, "rate"))
# Now put the date on each element.
# Read the time attribute from the parent of each SIT node instead of running
# a second, independent query for every <Cube time=".." ...> node: the labels
# then stay aligned with the rates even if some dated node has no SIT entry.
# NOTE(review): this relies on each currency <Cube> being a direct child of
# the <Cube time=".."> node for its date - confirm against the data file.
names(rates) <- sapply(
  slovenia,
  function(node) xmlGetAttr(xmlParent(node), "time")
)
# Or we could turn these into dates with strptime()
strptime(names(rates), "%Y-%m-%d")
# Using xpathApply, we can do the query and the attribute extraction in one
# call; the result is a list, so flatten it before converting to numbers.
rates <- xpathApply(
  doc, "//ns:Cube[@currency='SIT']",
  xmlGetAttr, "rate",
  namespaces = namespaces
)
rates <- as.numeric(unlist(rates))
# Using an expression rather than a function and ...
# (the quoted call refers to the current node as x)
rates <- xpathApply(
  doc, "//ns:Cube[@currency='SIT']",
  quote(xmlGetAttr(x, "rate")),
  namespaces = namespaces
)
free(doc)
#
uri <- system.file("exampleData", "namespaces.xml", package = "XML")
d <- xmlTreeParse(uri, useInternalNodes = TRUE)
# Elements named c that live in the namespace bound here to the prefix c
getNodeSet(d, "//c:c", c(c = "http://www.c.org"))
# The following, perhaps unexpectedly but correctly, returns an empty
# node set (no matches): the element name in the XPath expression carries
# no prefix, so it is not looked up in the namespace supplied here.
getNodeSet(d, "//defaultNs", "http://www.omegahat.org")
# But if we create our own prefix for the evaluation of the XPath
# expression and use this in the expression, things work as one
# might hope.
getNodeSet(
  d, "//dummy:defaultNs",
  c(dummy = "http://www.omegahat.org")
)
# And since the default value for the namespaces argument is the
# default namespace of the document with the prefix 'd', we can use
getNodeSet(d, "//d:defaultNs")
# And the syntactic sugar for the same query is
d["//d:defaultNs"]
free(d)
# Work with the nodes and their content (not just attributes) from the
# node set.
# From bondsTables.R in examples/
# Note: this reads the page over the network.
doc <- htmlTreeParse(
  "http://finance.yahoo.com/bonds/composite_bond_rates",
  useInternalNodes = TRUE
)
# Use an XPath expression to find the nodes
#   <div><table class="yfirttbl">..
# as these are the ones we want.
o <- getNodeSet(doc, "//div/table[@class='yfirttbl']")
# Write a function that will extract the information out of a given table node.
# Convert one table node into a numeric matrix.
#
# tb: a table node that is indexed here for a "thead" child (holding a "tr"
#     of "th" cells) and a "tbody" child (holding "tr" rows of "td" cells).
#
# Returns a numeric matrix with one row per body row. The first cell of each
# body row becomes the row name, and the header cells after the first become
# the column names.
readHTMLTable <- function(tb) {
  # get the header information.
  header_cells <- tb[["thead"]][["tr"]]["th"]
  colNames <- sapply(header_cells, xmlValue)

  # One column of text per body row: vals[i, j] is cell i of row j.
  body_rows <- tb[["tbody"]]["tr"]
  vals <- sapply(body_rows, function(row) sapply(row["td"], xmlValue))

  # The first cell of every row is its label; the rest are the numbers.
  row_labels <- vals[1, ]
  cell_values <- as.numeric(vals[-1, ])

  # cell_values is laid out one table row after another, so fill by row.
  matrix(
    cell_values,
    nrow = ncol(vals),
    byrow = TRUE,
    dimnames = list(row_labels, colNames[-1])
  )
}
# Now process each of the table nodes in the o list.
tables <- lapply(o, readHTMLTable)
# Label each resulting matrix with the text of its table's "caption" child.
names(tables) <- lapply(o, function(x) xmlValue(x[["caption"]]))