Rcrawler(Website ="http://glofile.com/", no_cores = 4, no_conn = 4)
#Crawl, index, and store web pages using 4 cores and 4 parallel requests
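# Once the crawl completes, the collected pages can be examined. A minimal
# sketch, assuming this Rcrawler version creates an INDEX data frame in the
# global environment and stores the downloaded HTML files in the workspace:
head(INDEX)    # one row per crawled page (URL, HTTP status, crawl level, ...)
nrow(INDEX)    # total number of pages collected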
Rcrawler(Website = "http://glofile.com/", urlregexfilter = "/[0-9]{4}/[0-9]{2}/",
ExtractPatterns = c("//*/article","//*/h1"), PatternsNames = c("content","title"))
# Crawl the website using the default configuration and scrape content matching
# two XPath patterns, only from post pages whose URLs match the regular
# expression "/[0-9]{4}/[0-9]{2}/". Note that the excludepattern parameter can
# be used to exclude a node from extraction, e.g., when a desired node contains
# (is the parent of) an undesired child node, as in the sketch below.
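# A minimal sketch of such an exclusion, assuming excludepattern accepts a
# vector of XPath patterns; the div[@class='comments'] node is a hypothetical
# unwanted child of the extracted article element:
Rcrawler(Website = "http://glofile.com/", urlregexfilter = "/[0-9]{4}/[0-9]{2}/",
         ExtractPatterns = c("//*/article"), PatternsNames = c("content"),
         excludepattern = c("//*/div[@class='comments']"))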
Rcrawler(Website = "http://www.example.com/", no_cores=8, no_conn=8, Obeyrobots = TRUE,
Useragent="Mozilla 3.11")
# Crawl and index the website using 8 cores and 8 parallel requests with respect to
robot.txt rules.
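# To crawl politely beyond obeying robots.txt, requests can also be throttled;
# a minimal sketch, assuming this Rcrawler version supports a RequestsDelay
# argument (seconds to wait between rounds of parallel requests):
Rcrawler(Website = "http://www.example.com/", no_cores = 2, no_conn = 2,
         Obeyrobots = TRUE, RequestsDelay = 1)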
Rcrawler(Website = "http://www.example.com/", no_cores = 4, no_conn = 4,
urlregexfilter = "/[0-9]{4}/[0-9]{2}/", DIR = "./myrepo", MaxDepth=3)
# Crawl the website using 4 cores and 4 parallel requests. Only URLs matching
# the regular expression "/[0-9]{4}/[0-9]{2}/" are indexed, pages are stored in
# the custom directory "./myrepo", and the crawler stops on reaching level 3.
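# A quick way to verify the stored repository afterwards; a minimal sketch,
# assuming the crawled pages are written as files under the DIR path "./myrepo":
list.files("./myrepo")    # downloaded page files
head(INDEX)               # and the corresponding URL index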