# Alternative source page:
# u <- "http://en.wikipedia.org/wiki/World_population"
u <- "http://en.wikipedia.org/wiki/List_of_countries_by_population"

# Read the tables found on the page and list their names.
# NOTE(review): readHTMLTable(), htmlParse() and getNodeSet() are
# presumably provided by the XML package, which must already be attached.
tables <- readHTMLTable(u)
names(tables)

# Show the second table. Its values are all characters rather than
# numbers, and column names that would begin with a digit carry a
# leading X because R does not allow variable names to start with digits.
tables[[2]]
tmp <- tables[[2]]

# Alternatively, parse the page once, collect its <table> nodes and
# convert just the second one.
doc <- htmlParse(u)
tableNodes <- getNodeSet(doc, "//table")
tb <- readHTMLTable(tableNodes[[2]])
# Let's try to adapt the values on the fly.
# tryAsInteger() turns a th/td node into a value: an integer when the
# cell text is a whole number (thousands separators allowed), otherwise
# the original text.
#
# Arguments:
#   node  a table cell node whose text is extracted with xmlValue().
# Value:
#   A length-one integer when the text converts cleanly, else the
#   unmodified result of xmlValue(node).
tryAsInteger = function(node) {
  val = xmlValue(node)

  # An empty or missing value cannot be converted; testing is.na() on a
  # zero-length result inside if() would be an error, so return early.
  if(length(val) != 1L || is.na(val))
    return(val)

  # Strip thousands separators and surrounding whitespace.
  digits = gsub(",", "", trimws(val))

  # Accept whole numbers only. This keeps ordinary text from triggering
  # a coercion warning and stops values such as "3.7" from being
  # silently truncated to 3.
  if(!grepl("^[+-]?[0-9]+$", digits))
    return(val)

  # Numbers beyond the integer range become NA (with a warning); in that
  # case keep the original text instead.
  ans = suppressWarnings(as.integer(digits))
  if(is.na(ans))
    val
  else
    ans
}
# Convert the second table again, this time passing each cell through
# tryAsInteger() via elFun.
tb <- readHTMLTable(tableNodes[[2]], elFun = tryAsInteger)

# Same conversion, additionally declaring the column classes: one
# character column followed by nine integer columns.
tb <- readHTMLTable(
  tableNodes[[2]],
  elFun = tryAsInteger,
  colClasses = c("character", rep("integer", 9))
)
# Read every table from the historical CPI page.
zz <-
  readHTMLTable("http://www.inflationdata.com/Inflation/Consumer_Price_Index/HistoricalCPI.aspx")

# Flag the tables that have exactly 14 columns (missing tables count as 0).
i <- sapply(zz, function(x) if(is.null(x)) 0 else ncol(x)) == 14

# Guard against the structure of the page changing: only continue when
# a 14-column table is present.
if(any(i)) {
  # Keep the first matching table (the 4th table when this was written).
  zz <- zz[[match(TRUE, i)]]

  # Convert every column but the first to numeric, dropping anything up
  # to and including the last space in each value.
  # Could use colClasses in the call to readHTMLTable() instead.
  zz[-1] <- lapply(zz[-1], function(x) {
    as.numeric(gsub(".* ", "", as.character(x)))
  })

  # Plot the columns that remain after dropping the 1st and the 14th,
  # one line per table row, against 1..12.
  matplot(seq_len(12), t(zz[-c(1, 14)]), type = "l")
}
# From Marsh Feldman on R-help
# Note: `doc` is (re)assigned here and holds the page URL as a string.
doc <- "http://www.nber.org/cycles/cyclesmain.html"

# The main table is the second one because it's embedded in the page table.
table <- getNodeSet(htmlParse(doc), "//table")[[2]]

# Convert it with explicit column names, reading all six columns as
# character and keeping strings as strings.
xt <- readHTMLTable(
  table,
  header = c(
    "peak", "trough", "contraction",
    "expansion", "trough2trough", "peak2peak"
  ),
  colClasses = rep("character", 6),
  trim = TRUE,
  stringsAsFactors = FALSE
)
if(FALSE) {
  # Disabled example (the condition is FALSE, so this never runs).
  # Here is a totally different way of reading tables from HTML documents.
  # The data are formatted using a PRE and so can be read via read.table
  u = "http://tidesonline.nos.noaa.gov/data_read.shtml?station_info=9414290+San+Francisco,+CA"
  h = htmlParse(u)
  p = getNodeSet(h, "//pre")
  # Expose the text of the second <pre> node as a connection.
  con = textConnection(xmlValue(p[[2]]))
  # textConnection() returns an already-open connection that read.table()
  # does not close, so close it ourselves, even if parsing fails.
  tides = tryCatch(read.table(con), finally = close(con))
}
# Read a table from a password-protected page. This only runs when
# RCurl can be loaded and the page is reachable.
if(require(RCurl)) {
  if(url.exists("http://www.omegahat.org/RCurl/testPassword/table.html")) {
    # Fetch the page content with the credentials, then convert the
    # tables in the downloaded HTML text.
    tt <- getURL(
      "http://www.omegahat.org/RCurl/testPassword/table.html",
      userpwd = "bob:duncantl"
    )
    readHTMLTable(tt)
  }
}
# Run the code above in your browser using DataLab