## ----include = FALSE----------------------------------------------------------
# Document-wide knitr chunk defaults: collapse source and output into one
# block, prefix output lines with "#>", and do not evaluate chunks.
# (eval = FALSE is presumably why the chunks below are emitted commented out.)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## ----setup--------------------------------------------------------------------
# library(crawlee)

## -----------------------------------------------------------------------------
# # Minimal pipeline: one start URL, an HTML handler that pushes the request
# # URL as a record, then cr_run() followed by cr_collect() into `result`.
# result <- crawler("https://books.toscrape.com/") |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url))
#   }) |>
#   cr_run() |>
#   cr_collect()

## -----------------------------------------------------------------------------
# # Same start URL and handler, with cr_dataset() configured for
# # backend = "duckdb" and path = "books.duckdb". The chain ends at cr_run().
# # NOTE(review): the result is not assigned, so cr_close() cannot be called
# # on it afterwards -- confirm whether the DuckDB backend needs that here.
# crawler("https://books.toscrape.com/") |>
#   cr_dataset(backend = "duckdb", path = "books.duckdb") |>
#   cr_on_html(function(ctx) ctx$push_data(list(url = ctx$request$url))) |>
#   cr_run()

## -----------------------------------------------------------------------------
# # PDF example: cr_store("downloads") plus a cr_on_pdf() handler that pushes
# # the request URL with a page count, then calls ctx$save_body(ext = "pdf").
# # NOTE(review): `pages` assumes ctx$pdf_text() returns one element per page
# # -- confirm against the crawlee documentation.
# crawler("https://example.com/report.pdf") |>
#   cr_store("downloads") |>
#   cr_on_pdf(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url, pages = length(ctx$pdf_text())))
#     ctx$save_body(ext = "pdf") # -> downloads/<sanitised-url>.pdf
#   }) |>
#   cr_run()

## -----------------------------------------------------------------------------
# # Persistent run: cr_persist() is pointed at "runs/books" with
# # dataset = "duckdb". The handler pushes each request URL and enqueues links
# # matching the glob "*/catalogue/*". The value returned by cr_run() is kept
# # in `crawl` so it can be passed to cr_collect() and then cr_close().
# crawl <- crawler("https://books.toscrape.com/") |>
#   cr_persist("runs/books", dataset = "duckdb") |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url))
#     ctx$enqueue_links(glob = "*/catalogue/*")
#   }) |>
#   cr_run()
# 
# data <- cr_collect(crawl)
# cr_close(crawl) # release the DuckDB connection

## -----------------------------------------------------------------------------
# # Same code as above: it resumes instead of starting over.
# # NOTE(review): the pipeline is the same as in the previous chunk, but the
# # result is not assigned, collected or closed here -- confirm whether
# # cr_close() should also be shown for this run.
# crawler("https://books.toscrape.com/") |>
#   cr_persist("runs/books", dataset = "duckdb") |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(url = ctx$request$url))
#     ctx$enqueue_links(glob = "*/catalogue/*")
#   }) |>
#   cr_run()

