# Load the XML package, which supplies readHTMLTable(), htmlParse(),
# getNodeSet() and xmlValue() used in this script.
library(XML)

# Page whose HTML tables we want to extract.
# NOTE(review): this is a plain-http URL; confirm the site still serves it
# without redirecting to https, which the parser may not be able to follow.
u <- "http://en.wikipedia.org/wiki/World_population"

# Read the tables found in the page and list their names.
tables <- readHTMLTable(u)
names(tables)

# Print the second table. Note that the values are all characters,
# not numbers. Also the column names have a preceding X since
# R doesn't allow the variable names to start with digits.
tables[[2]]
# Work on a copy of the second table.
tmp <- tables[[2]]

# Reshape it so that the rows are years and the columns are population
# counts, labelled by the table's first column. The result is a matrix.
vals <- local({
  # Year labels: names of every column but the first, with "X" removed.
  years <- as.integer(gsub("X", "", names(tmp)[-1]))

  # Cell values of those columns, with the thousands separators stripped.
  cells <- as.integer(gsub(",", "", as.character(unlist(tmp[-1]))))

  # unlist() walks the table column by column, so filling by row with one
  # row per year column puts each year on its own row.
  counts <- matrix(
    cells,
    nrow = ncol(tmp) - 1,
    byrow = TRUE,
    dimnames = list(NULL, as.character(tmp[[1]]))
  )

  cbind(year = years, counts)
})
# Alternatively, parse the page ourselves and read only the second table.
doc <- htmlParse(u)

# Collect every <table> element found anywhere in the document.
tableNodes <- getNodeSet(doc, "//table")

# Convert just the second table node.
tb <- readHTMLTable(tableNodes[[2]])
# Let's try to adapt the values on the fly.
# We'll create a function that turns a th/td node into a value:
# an integer when the cell text is written as a whole number (thousands
# separators and surrounding whitespace are ignored), otherwise the text
# itself.
#
# node: a table cell node whose text is extracted with xmlValue().
# Returns a length-one integer, or the cell's character value unchanged
# when it is absent, missing, not written as a whole number, or too large
# to be stored as an R integer.
tryAsInteger <- function(node) {
  val <- xmlValue(node)

  # Nothing to convert for absent or missing text; this also keeps the
  # checks below from failing on a zero-length value.
  if (length(val) != 1L || is.na(val)) {
    return(val)
  }

  digits <- trimws(gsub(",", "", val))

  # Only accept text that is literally an integer. Calling as.integer() on
  # arbitrary text would silently truncate "1.5" to 1 and would warn on
  # every non-numeric cell.
  if (!grepl("^[+-]?[0-9]+$", digits)) {
    return(val)
  }

  # Go through double so that out-of-range values can be detected without
  # triggering a coercion warning.
  num <- as.numeric(digits)
  if (abs(num) > .Machine$integer.max) {
    return(val)
  }

  as.integer(num)
}
# Re-read the second table, converting each cell with tryAsInteger as it
# is extracted.
tb <- readHTMLTable(tableNodes[[2]], elFun = tryAsInteger)

# Same again, but also declare the column types: "character" for the first
# column and "integer" for the remaining nine.
tb <- readHTMLTable(tableNodes[[2]], elFun = tryAsInteger,
                    colClasses = c("character", rep("integer", 9)))