# NOT RUN {
# }
# NOT RUN {
Rcrawler(
  Website = "http://glofile.com/",
  no_cores = 4,
  no_conn = 4
)
# Crawl, index, and store the web pages using 4 cores and 4 parallel
# requests.
Rcrawler(
  Website = "http://glofile.com/",
  urlregexfilter = "/[0-9]{4}/[0-9]{2}/",
  ExtractPatterns = c("//*/article", "//*/h1"),
  PatternsNames = c("content", "title")
)
# Crawl the website using the default configuration and scrape content matching
# two XPath patterns, only from post pages matching the regular expression
# "/[0-9]{4}/[0-9]{2}/".
# Note that the excludepattern parameter can be used to exclude a node from
# being extracted, e.g., when a desired node includes (is a parent of) an
# undesired "child" node.
Rcrawler(
  Website = "http://www.example.com/",
  no_cores = 8,
  no_conn = 8,
  Obeyrobots = TRUE,
  Useragent = "Mozilla 3.11"
)
# Crawl and index the website using 8 cores and 8 parallel requests while
# respecting the site's robots.txt rules.
Rcrawler(
  Website = "http://www.example.com/",
  no_cores = 4,
  no_conn = 4,
  urlregexfilter = "/[0-9]{4}/[0-9]{2}/",
  DIR = "./myrepo",
  MaxDepth = 3
)
# Crawl the website using 4 cores and 4 parallel requests. However, this will
# only index URLs matching the regular expression pattern
# (/[0-9]{4}/[0-9]{2}/), and stores pages in a custom directory "myrepo".
# The crawler stops after reaching the third level of website depth.
Rcrawler(
  Website = "http://www.example.com/",
  KeywordsFilter = c("keyword1", "keyword2")
)
# Crawl the website and keep only the web pages that contain keyword1,
# keyword2, or both.
Rcrawler(
  Website = "http://www.example.com/",
  KeywordsFilter = c("keyword1", "keyword2"),
  KeywordsAccuracy = 50
)
# Crawl the website and collect only web pages that have an accuracy percentage
# higher than 50% of matching keyword1 and keyword2.
Rcrawler(
  Website = "http://glofile.com/",
  no_cores = 4,
  no_conn = 4,
  GraphData = TRUE
)
# Crawl the entire website and create the network edges data of its internal
# links.
# Using igraph, for example, you can plot the network with the following
# commands:
# library(igraph)
# network <- graph_from_data_frame(NetwEdges, directed = TRUE)
# plot(network)
# }
# NOT RUN {
# }
# Run the code above in your browser using DataLab