u = "http://en.wikipedia.org/wiki/World_population"
tables = readHTMLTable(u)
names(tables)
tables[[2]]
# Print the table. Note that the values are all characters
# not numbers. Also the column names have a preceding X since
# R doesn't allow the variable names to start with digits.
tmp = tables[[2]]
# We can transform this to get the rows to be years and the columns
# to be population counts. We'll create a matrix.
vals = cbind(year = as.integer(gsub("X", "", names(tmp)[-1])),
matrix(as.integer(gsub(",", "", as.character(unlist(tmp[-1])))),
ncol(tmp)-1, byrow = TRUE, dimnames = list(NULL, as.character(tmp[[1]]))))
# Let's just read the second table directly by itself.
doc = htmlParse(u)
tableNodes = getNodeSet(doc, "//table")
tb = readHTMLTable(tableNodes[[2]])
# Let's try to adapt the values on the fly.
# We'll create a function that turns a th/td node into a val
tryAsInteger = function(node) {
val = xmlValue(node)
ans = as.integer(gsub(",", "", val))
if(is.na(ans))
val
else
ans
}
tb = readHTMLTable(tableNodes[[2]], elFun = tryAsInteger)
tb = readHTMLTable(tableNodes[[2]], elFun = tryAsInteger,
colClasses = c("character", rep("integer", 9)))
zz = readHTMLTable("http://www.inflationdata.com/Inflation/Consumer_Price_Index/HistoricalCPI.aspx")
zz = zz[[4]] # 4th table
# convert columns to numeric. Could use colClasses in the call to readHTMLTable()
zz[-1] = lapply(zz[-1], function(x) as.numeric(gsub(".* ", "", as.character(x))))
matplot(1:12, t(zz[-c(1, 14)]), type = "l")
# From Marsh Feldman on R-help
doc <- "http://www.nber.org/cycles/cyclesmain.html"
# The main table is the second one because it's embedded in the page table.
table <- getNodeSet(htmlParse(doc),"//table") [[2]]
xt <- readHTMLTable(table,
header = c("peak","trough","contraction",
"expansion","trough2trough","peak2peak"),
colClasses = c("character","character","character",
"character","character","character"),
trim = TRUE, stringsAsFactors = FALSE
)
Run the code above in your browser using DataLab