textpress is organized around four actions — fetch, read, process, search. This vignette covers the first two. fetch_urls() runs a search query and returns candidate URLs with metadata; read_urls() scrapes their content into a node-level data frame. Together they turn a search term into an analysis-ready corpus in a few lines of code; the remaining vignettes build on that corpus.

Search and read

library(textpress)
library(dplyr)
library(DT)

# Run a web search and collect candidate article URLs plus metadata.
# date_filter = "m" restricts results to roughly the past month.
web_urls <- textpress::fetch_urls(
  query       = "us polling on immigration",
  date_filter = "m",
  n_pages     = 4
)
# Keep URLs with a non-trivial path (path_depth > 0 drops bare
# domain/landing pages), then scrape each page's content in parallel.
article_urls <- web_urls |>
  filter(path_depth > 0) |>
  pull(url)

web_text_list <- textpress::read_urls(article_urls, cores = 4)
# Minimal HTML escaping for text that will be rendered with
# DT's escape = FALSE. Scraped headlines can contain &, <, or >,
# which would otherwise break the generated <a> markup (and raw
# scraped text is untrusted input). '&' must be replaced first.
html_escape <- function(x) {
  x <- gsub("&", "&amp;", x, fixed = TRUE)
  x <- gsub("<", "&lt;", x, fixed = TRUE)
  gsub(">", "&gt;", x, fixed = TRUE)
}

# Build the display table: keep only rows with a real (non-blank)
# headline, sort newest first, and render the headline as a link.
# rel="noopener" prevents reverse tabnabbing from target="_blank".
metas_dt <- web_text_list$meta |>
  filter(!is.na(h1_title) & nzchar(trimws(h1_title))) |>
  arrange(desc(date)) |>
  mutate(
    title_link = paste0(
      '<a href="', url, '" target="_blank" rel="noopener">',
      html_escape(h1_title), '</a>'
    )
  )

# Render the metadata as an interactive table. escape = FALSE lets the
# HTML anchor in title_link render as a clickable link; the columnDefs
# entry disables sorting on that column (DT column indices are 0-based,
# so targets = 2 is the third column, title_link).
link_column_opts <- list(
  columnDefs = list(list(targets = 2, orderable = FALSE))
)

DT::datatable(
  select(metas_dt, date, source, title_link),
  options  = link_column_opts,
  escape   = FALSE,
  rownames = FALSE
)