# NOT RUN {
# make connection object
(x <- connect())
# load some data
if (!index_exists(x, "shakespeare")) {
  shakespeare <- system.file("examples", "shakespeare_data.json",
    package = "elastic")
  shakespeare <- type_remover(shakespeare)
  invisible(docs_bulk(x, shakespeare))
}
if (!index_exists(x, "gbif")) {
  gbif <- system.file("examples", "gbif_data.json",
    package = "elastic")
  gbif <- type_remover(gbif)
  invisible(docs_bulk(x, gbif))
}
if (!index_exists(x, "plos")) {
  plos <- system.file("examples", "plos_data.json",
    package = "elastic")
  plos <- type_remover(plos)
  invisible(docs_bulk(x, plos))
}
# URI string queries
Search(x, index="shakespeare")
## if you're using an older ES version, you may have types
if (gsub("\\.", "", x$ping()$version$number) < 700) {
  Search(x, index="shakespeare", type="act")
  Search(x, index="shakespeare", type="scene")
  Search(x, index="shakespeare", type="line")
}
## Return certain fields
if (gsub("\\.", "", x$ping()$version$number) < 500) {
  ### ES < v5
  Search(x, index="shakespeare", fields=c('play_name','speaker'))
} else {
  ### ES > v5
  Search(x, index="shakespeare", body = '{
   "_source": ["play_name", "speaker"]
  }')
}
## Search multiple indices
Search(x, index = "gbif")$hits$total$value
Search(x, index = "shakespeare")$hits$total$value
Search(x, index = c("gbif", "shakespeare"))$hits$total$value
## search_type
Search(x, index="shakespeare", search_type = "query_then_fetch")
Search(x, index="shakespeare", search_type = "dfs_query_then_fetch")
### search type "scan" is gone - use time_scroll instead
Search(x, index="shakespeare", time_scroll = "2m")
### search type "count" is gone - use size=0 instead
Search(x, index="shakespeare", size = 0)$hits$total$value
## search exists check
### use size set to 0 and terminate_after set to 1
### if there are > 0 hits, then there are matching documents
Search(x, index="shakespeare", size = 0, terminate_after = 1)
## sorting
### if ES >5, we need to make sure fielddata is turned on for a field 
### before using it for sort 
if (gsub("\\.", "", x$ping()$version$number) >= 500) {
 if (index_exists(x, "shakespeare")) index_delete(x, "shakespeare")
 index_create(x, "shakespeare")
 mapping_create(x, "shakespeare", body = '{
    "properties": {
      "speaker": { 
        "type":     "text",
        "fielddata": true
      }
    }
  }'
 )
 shakespeare <- system.file("examples", "shakespeare_data.json",
   package = "elastic")
 shakespeare <- type_remover(shakespeare)
 invisible(docs_bulk(x, shakespeare))
 z <- Search(x, index="shakespeare", sort="speaker", size = 30)
 vapply(z$hits$hits, function(w) w$`_source`$speaker, "")
}
if (gsub("\\.", "", x$ping()$version$number) < 500) {
  Search(x, index="shakespeare", type="line", sort="speaker:desc", 
    fields='speaker')
  Search(x, index="shakespeare", type="line",
    sort=c("speaker:desc","play_name:asc"), fields=c('speaker','play_name'))
}
## pagination
Search(x, index="shakespeare", size=1)$hits$hits
Search(x, index="shakespeare", size=1, from=1)$hits$hits
## queries
### Search in all fields
Search(x, index="shakespeare", q="york")
### Searchin specific fields
Search(x, index="shakespeare", q="speaker:KING HENRY IV")$hits$total$value
### Exact phrase search by wrapping in quotes
Search(x, index="shakespeare", q='speaker:"KING HENRY IV"')$hits$total$value
### can specify operators between multiple words parenthetically
Search(x, index="shakespeare", q="speaker:(HENRY OR ARCHBISHOP)")$hits$total$value
### where the field line_number has no value (or is missing)
Search(x, index="shakespeare", q="_missing_:line_number")$hits$total$value
### where the field line_number has any non-null value
Search(x, index="shakespeare", q="_exists_:line_number")$hits$total$value
### wildcards, either * or ?
Search(x, index="shakespeare", q="*ay")$hits$total$value
Search(x, index="shakespeare", q="m?y")$hits$total$value
### regular expressions, wrapped in forward slashes
Search(x, index="shakespeare", q="text_entry:/[a-z]/")$hits$total$value
### fuzziness
Search(x, index="shakespeare", q="text_entry:ma~")$hits$total$value
Search(x, index="shakespeare", q="text_entry:the~2")$hits$total$value
Search(x, index="shakespeare", q="text_entry:the~1")$hits$total$value
### Proximity searches
Search(x, index="shakespeare", q='text_entry:"as hath"~5')$hits$total$value
Search(x, index="shakespeare", q='text_entry:"as hath"~10')$hits$total$value
### Ranges, here where line_id value is between 10 and 20
Search(x, index="shakespeare", q="line_id:[10 TO 20]")$hits$total$value
### Grouping
Search(x, index="shakespeare", q="(hath OR as) AND the")$hits$total$value
# Limit number of hits returned with the size parameter
Search(x, index="shakespeare", size=1)
# Give explanation of search in result
Search(x, index="shakespeare", size=1, explain=TRUE)
## terminate query after x documents found
## setting to 1 gives back one document for each shard
Search(x, index="shakespeare", terminate_after=1)
## or set to other number
Search(x, index="shakespeare", terminate_after=2)
## Get version number for each document
Search(x, index="shakespeare", version=TRUE, size=2)
## Get raw data
Search(x, index="shakespeare", raw = TRUE)
## Curl options 
### verbose 
out <- Search(x, index="shakespeare", verbose = TRUE)
# Query DSL searches - queries sent in the body of the request
## Pass in as an R list
### if ES >5, we need to make sure fielddata is turned on for a field 
### before using it for aggregations 
if (gsub("\\.", "", x$ping()$version$number) >= 500) {
  mapping_create(x, "shakespeare", update_all_types = TRUE, body = '{
    "properties": {
      "text_entry": { 
        "type":     "text",
        "fielddata": true
     }
   }
 }')
 aggs <- list(aggs = list(stats = list(terms = list(field = "text_entry"))))
 Search(x, index="shakespeare", body=aggs)
}
### if ES >5, you don't need to worry about fielddata
if (gsub("\\.", "", x$ping()$version$number) < 500) {
   aggs <- list(aggs = list(stats = list(terms = list(field = "text_entry"))))
   Search(x, index="shakespeare", body=aggs)
}
## or pass in as json query with newlines, easy to read
aggs <- '{
    "aggs": {
        "stats" : {
            "terms" : {
                "field" : "speaker"
            }
        }
    }
}'
Search(x, index="shakespeare", body=aggs, asdf=TRUE, size = 0)
## or pass in collapsed json string
aggs <- '{"aggs":{"stats":{"terms":{"field":"text_entry"}}}}'
Search(x, index="shakespeare", body=aggs)
## Aggregations
### Histograms
aggs <- '{
    "aggs": {
        "latbuckets" : {
           "histogram" : {
               "field" : "decimalLatitude",
               "interval" : 5
           }
        }
    }
}'
Search(x, index="gbif", body=aggs, size=0)
### Histograms w/ more options
aggs <- '{
    "aggs": {
        "latbuckets" : {
           "histogram" : {
               "field" : "decimalLatitude",
               "interval" : 5,
               "min_doc_count" : 0,
               "extended_bounds" : {
                   "min" : -90,
                   "max" : 90
               }
           }
        }
    }
}'
Search(x, index="gbif", body=aggs, size=0)
### Ordering the buckets by their doc_count - ascending:
aggs <- '{
    "aggs": {
        "latbuckets" : {
           "histogram" : {
               "field" : "decimalLatitude",
               "interval" : 5,
               "min_doc_count" : 0,
               "extended_bounds" : {
                   "min" : -90,
                   "max" : 90
               },
               "order" : {
                   "_count" : "desc"
               }
           }
        }
    }
}'
out <- Search(x, index="gbif", body=aggs, size=0)
lapply(out$aggregations$latbuckets$buckets, data.frame)
### By default, the buckets are returned as an ordered array. It is also possible to
### request the response as a hash instead keyed by the buckets keys:
aggs <- '{
    "aggs": {
        "latbuckets" : {
           "histogram" : {
               "field" : "decimalLatitude",
               "interval" : 10,
               "keyed" : true
           }
        }
    }
}'
Search(x, index="gbif", body=aggs, size=0)
# match query
match <- '{"query": {"match" : {"text_entry" : "Two Gentlemen"}}}'
Search(x, index="shakespeare", body=match)
# multi-match (multiple fields that is) query
mmatch <- '{"query": {"multi_match" : {"query" : "henry", "fields": ["text_entry","play_name"]}}}'
Search(x, index="shakespeare", body=mmatch)
# bool query
mmatch <- '{
 "query": {
   "bool" : {
     "must_not" : {
       "range" : {
         "speech_number" : {
           "from" : 1, "to": 5
}}}}}}'
Search(x, index="shakespeare", body=mmatch)
# Boosting query
boost <- '{
 "query" : {
  "boosting" : {
      "positive" : {
          "term" : {
              "play_name" : "henry"
          }
      },
      "negative" : {
          "term" : {
              "text_entry" : "thou"
          }
      },
      "negative_boost" : 0.8
    }
 }
}'
Search(x, index="shakespeare", body=boost)
# Fuzzy query
## fuzzy query on numerics
fuzzy <- list(query = list(fuzzy = list(text_entry = "arms")))
Search(x, index="shakespeare", body=fuzzy)$hits$total$value
fuzzy <- list(query = list(fuzzy = list(text_entry = list(value = "arms", fuzziness = 4))))
Search(x, index="shakespeare", body=fuzzy)$hits$total$value
# geoshape query
## not working yets
geo <- list(query = list(geo_shape = list(location = list(shape = list(type = "envelope",
   coordinates = "[[2,10],[10,20]]")))))
geo <- '{
 "query": {
   "geo_shape": {
     "location": {
       "point": {
         "type": "envelope",
         "coordinates": [[2,0],[2.93,100]]
       }
     }
   }
 }
}'
# Search(x, index="gbifnewgeo", body=geo)
# range query
## with numeric
body <- list(query=list(range=list(decimalLongitude=list(gte=1, lte=3))))
Search(x, 'gbif', body=body)$hits$total$value
body <- list(query=list(range=list(decimalLongitude=list(gte=2.9, lte=10))))
Search(x, 'gbif', body=body)$hits$total$value
## with dates
body <- list(query=list(range=list(eventDate=list(gte="2012-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total$value
body <- list(query=list(range=list(eventDate=list(gte="2014-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total$value
# more like this query (more_like_this can be shortened to mlt)
body <- '{
 "query": {
   "more_like_this": {
     "fields": ["title"],
     "like": "and then",
     "min_term_freq": 1,
     "max_query_terms": 12
   }
 }
}'
Search(x, 'plos', body=body)$hits$total$value
body <- '{
 "query": {
   "more_like_this": {
     "fields": ["abstract","title"],
     "like": "cell",
     "min_term_freq": 1,
     "max_query_terms": 12
   }
 }
}'
Search(x, 'plos', body=body)$hits$total$value
# Highlighting
body <- '{
 "query": {
   "query_string": {
     "query" : "cell"
   }
 },
 "highlight": {
   "fields": {
     "title": {"number_of_fragments": 2}
   }
 }
}'
out <- Search(x, 'plos', body=body)
out$hits$total$value
sapply(out$hits$hits, function(x) x$`_source`$title[[1]])
### Common terms query
body <- '{
 "query" : {
   "match": {
      "text_entry": {
         "query": "this is"
      }
   }
 }
}'
Search(x, 'shakespeare', body=body)
## Scrolling search - instead of paging
res <- Search(x, index = 'shakespeare', q="a*", time_scroll="1m")
scroll(x, res$`_scroll_id`)
res <- Search(x, index = 'shakespeare', q="a*", time_scroll="5m")
out <- list()
hits <- 1
while(hits != 0){
  res <- scroll(x, res$`_scroll_id`)
  hits <- length(res$hits$hits)
  if(hits > 0)
    out <- c(out, res$hits$hits)
}
### Sliced scrolling
#### For scroll queries that return a lot of documents it is possible to 
#### split the scroll in multiple slices which can be consumed independently
body1 <- '{
  "slice": {
    "id": 0, 
    "max": 2 
  },
  "query": {
    "match" : {
      "text_entry" : "a*"
    }
  }
}'
body2 <- '{
  "slice": {
    "id": 1, 
    "max": 2 
  },
  "query": {
    "match" : {
      "text_entry" : "a*"
    }
  }
}'
res1 <- Search(x, index = 'shakespeare', time_scroll="1m", body = body1)
res2 <- Search(x, index = 'shakespeare', time_scroll="1m", body = body2)
scroll(x, res1$`_scroll_id`)
scroll(x, res2$`_scroll_id`)
out1 <- list()
hits <- 1
while(hits != 0){
  tmp1 <- scroll(x, res1$`_scroll_id`)
  hits <- length(tmp1$hits$hits)
  if(hits > 0)
    out1 <- c(out1, tmp1$hits$hits)
}
out2 <- list()
hits <- 1
while(hits != 0) {
  tmp2 <- scroll(x, res2$`_scroll_id`)
  hits <- length(tmp2$hits$hits)
  if(hits > 0)
    out2 <- c(out2, tmp2$hits$hits)
}
c(
 lapply(out1, "[[", "_source"),
 lapply(out2, "[[", "_source")
) 
# Using filters
## A bool filter
body <- '{
 "query":{
   "bool": {
     "must_not" : {
       "range" : {
         "year" : { "from" : 2011, "to" : 2012 }
       }
     }
   }
 }
}'
Search(x, 'gbif', body = body)$hits$total$value
## Geo filters - fun!
### Note that filers have many geospatial filter options, but queries 
### have fewer, andrequire a geo_shape mapping
body <- '{
 "mappings": {
     "properties": {
         "location" : {"type" : "geo_point"}
      }
   }
}'
index_recreate(x, index='gbifgeopoint', body=body)
path <- system.file("examples", "gbif_geopoint.json",
  package = "elastic")
path <- type_remover(path)
invisible(docs_bulk(x, path))
### Points within a bounding box
body <- '{
 "query":{
   "bool" : {
     "must" : {
       "match_all" : {}
     },
     "filter":{
        "geo_bounding_box" : {
          "location" : {
            "top_left" : {
              "lat" : 60,
              "lon" : 1
            },
            "bottom_right" : {
              "lat" : 40,
              "lon" : 14
            }
          }
       }
     }
   }
 }
}'
out <- Search(x, 'gbifgeopoint', body = body, size = 300)
out$hits$total$value
do.call(rbind, lapply(out$hits$hits, function(x) x$`_source`$location))
### Points within distance of a point
body <- '{
"query": {
  "bool" : {
    "must" : {
      "match_all" : {}
    },
   "filter" : {
     "geo_distance" : {
       "distance" : "200km",
       "location" : {
         "lon" : 4,
         "lat" : 50
       }
     }
  }
}}}'
out <- Search(x, 'gbifgeopoint', body = body)
out$hits$total$value
do.call(rbind, lapply(out$hits$hits, function(x) x$`_source`$location))
### Points within distance range of a point
body <- '{
 "aggs":{
   "points_within_dist" : {
     "geo_distance" : {
        "field": "location",
        "origin" : "4, 50",
        "ranges": [ 
          {"from" : 200},
          {"to" : 400}
         ]
     }
   }
 }
}'
out <- Search(x, 'gbifgeopoint', body = body)
out$hits$total$value
do.call(rbind, lapply(out$hits$hits, function(x) x$`_source`$location))
### Points within a polygon
body <- '{
 "query":{
   "bool" : {
     "must" : {
       "match_all" : {}
     },
     "filter":{
        "geo_polygon" : {
          "location" : {
             "points" : [
               [80.0, -20.0], [-80.0, -20.0], [-80.0, 60.0], [40.0, 60.0], [80.0, -20.0]
             ]
           }
         }
       }
     }
   }
}'
out <- Search(x, 'gbifgeopoint', body = body)
out$hits$total$value
do.call(rbind, lapply(out$hits$hits, function(x) x$`_source`$location))
### Geoshape filters using queries instead of filters
#### Get data with geojson type location data loaded first
body <- '{
 "mappings": {
     "properties": {
         "location" : {"type" : "geo_shape"}
      }
   }
}'
index_recreate(x, index='geoshape', body=body)
path <- system.file("examples", "gbif_geoshape.json",
  package = "elastic")
path <- type_remover(path)
invisible(docs_bulk(x, path))
#### Get data with a square envelope, w/ point defining upper left and the other
#### defining the lower right
body <- '{
 "query":{
   "geo_shape" : {
     "location" : {
         "shape" : {
           "type": "envelope",
            "coordinates": [[-30, 50],[30, 0]]
         }
       }
     }
   }
}'
out <- Search(x, 'geoshape', body = body)
out$hits$total$value
#### Get data with a circle, w/ point defining center, and radius
body <- '{
 "query":{
   "geo_shape" : {
     "location" : {
         "shape" : {
           "type": "circle",
           "coordinates": [-10, 45],
           "radius": "2000km"
         }
       }
     }
   }
}'
out <- Search(x, 'geoshape', body = body)
out$hits$total$value
#### Use a polygon, w/ point defining center, and radius
body <- '{
 "query":{
   "geo_shape" : {
     "location" : {
         "shape" : {
           "type": "polygon",
           "coordinates":  [
              [ [80.0, -20.0], [-80.0, -20.0], [-80.0, 60.0], [40.0, 60.0], [80.0, -20.0] ]
           ]
         }
       }
     }
   }
}'
out <- Search(x, 'geoshape', body = body)
out$hits$total$value
# Geofilter with WKT
# format follows "BBOX (minlon, maxlon, maxlat, minlat)"
body <- '{
    "query": {
        "bool" : {
            "must" : {
                "match_all" : {}
            },
            "filter" : {
                "geo_bounding_box" : {
                    "location" : {
                        "wkt" : "BBOX (1, 14, 60, 40)"
                    }
                }
            }
        }
    }
}'
out <- Search(x, 'gbifgeopoint', body = body)
out$hits$total$value
# Missing filter
if (gsub("\\.", "", x$ping()$version$number) < 500) {
  ### ES < v5
  body <- '{
   "query":{
     "constant_score" : {
       "filter" : {
         "missing" : { "field" : "play_name" }
       }
     }
    }
  }'
  Search(x, "shakespeare", body = body)
} else {
  ### ES => v5
  body <- '{
   "query":{
     "bool" : {
       "must_not" : {
         "exists" : { 
           "field" : "play_name" 
         }
       }
    }
   }
  }'
  Search(x, "shakespeare", body = body)
}
# prefix filter
body <- '{
 "query": {
   "bool": {
     "must": {
       "prefix" : {
         "speaker" : "we"
       }
     }
   }
 }
}'
z <- Search(x, "shakespeare", body = body)
z$hits$total$value
vapply(z$hits$hits, "[[", "", c("_source", "speaker"))
# ids filter
if (gsub("\\.", "", x$ping()$version$number) < 500) {
  ### ES < v5
  body <- '{
   "query":{
     "bool": {
       "must": {
         "ids" : {
           "values": ["1","2","10","2000"]
        }
      }
    }
   }
  }'
  z <- Search(x, "shakespeare", body = body)
  z$hits$total$value
  identical(
   c("1","2","10","2000"),
   vapply(z$hits$hits, "[[", "", "_id")
  )
} else {
  body <- '{
   "query":{
     "ids" : {
       "values": ["1","2","10","2000"]
     }
   }
  }'
  z <- Search(x, "shakespeare", body = body)
  z$hits$total$value
  identical(
   c("1","2","10","2000"),
   vapply(z$hits$hits, "[[", "", "_id")
  )
}
# combined prefix and ids filters
if (gsub("\\.", "", x$ping()$version$number) < 500) {
  ### ES < v5
  body <- '{
   "query":{
     "bool" : {
       "should" : {
         "or": [{
           "ids" : {
             "values": ["1","2","3","10","2000"]
           }
         }, {
         "prefix" : {
           "speaker" : "we"
         }
        }
      ]
     }
    }
   }
  }'
  z <- Search(x, "shakespeare", body = body)
  z$hits$total$value
} else {
  ### ES => v5
  body <- '{
   "query":{
     "bool" : {
       "should" : [
         {
           "ids" : {
             "values": ["1","2","3","10","2000"]
           }
         }, 
         {
           "prefix" : {
             "speaker" : "we"
           }
         }
      ]
     }
    }
  }'
  z <- Search(x, "shakespeare", body = body)
  z$hits$total$value
}
# Suggestions
sugg <- '{
 "query" : {
    "match" : {
      "text_entry" : "late"
     }
 },  
 "suggest" : {
   "sugg" : {
     "text" : "late",
     "term" : {
         "field" : "text_entry"
      }
    }
  }
}'
Search(x, index = "shakespeare", body = sugg, 
  asdf = TRUE, size = 0)$suggest$sugg$options
# stream data out using jsonlite::stream_out
file <- tempfile()
res <- Search(x, "shakespeare", size = 1000, stream_opts = list(file = file))
head(df <- jsonlite::stream_in(file(file)))
NROW(df)
unlink(file)
# get profile data
body <- '{
  "profile": true,
  "query" : {
    "match" : { "text_entry" : "war" }
  }
}'
res <- Search(x, "shakespeare", body = body)
res$profile
# time in nanoseconds across each of the shards
vapply(res$profile$shards, function(w) {
  w$searches[[1]]$query[[1]]$time_in_nanos
}, 1)
# }
Run the code above in your browser using DataLab