textpress is organized around four actions — fetch, read, process,
search. This vignette covers the first two. fetch_urls()
runs a search query and returns candidate URLs with metadata;
read_urls() scrapes their content into a node-level data
frame. Together, they turn a search term into an analysis-ready corpus in
a few lines of code; the remaining vignettes build on this output.
library(textpress)
library(dplyr)
library(DT)
# Query a search engine for recent coverage of the topic. `date_filter = "m"`
# limits results to roughly the past month, and `n_pages` controls how many
# result pages are collected.
web_urls <- textpress::fetch_urls(
  query = "us polling on immigration",
  date_filter = "m",
  n_pages = 4
)
# Scrape each candidate URL into text + metadata. NOTE(review): the original
# piped from `web_text_list`, which was never defined — the read step promised
# by the intro was missing, so this chunk errored with "object not found".
# The call below assumes `read_urls()` accepts a vector of URLs and returns a
# list with a `meta` component — TODO confirm against the textpress reference.
web_text_list <- textpress::read_urls(web_urls$url)

# Keep only pages with a usable <h1> headline, order newest first, and build
# an HTML anchor per article for display in the interactive table below.
metas_dt <- web_text_list$meta |>
  filter(!is.na(h1_title), nzchar(trimws(h1_title))) |>
  arrange(desc(date)) |>
  mutate(
    title_link = paste0(
      '<a href="', url, '" target="_blank">', h1_title, '</a>'
    )
  )
# Render the scraped metadata as an interactive table. `escape = FALSE` lets
# the pre-built HTML anchors in `title_link` render as clickable links;
# sorting is disabled on that column (DataTables `targets` are 0-indexed,
# so 2 = the third displayed column, `title_link`).
display_cols <- metas_dt |> select(date, source, title_link)

DT::datatable(
  display_cols,
  rownames = FALSE,
  escape = FALSE,
  options = list(
    columnDefs = list(list(targets = 2, orderable = FALSE))
  )
)