With a corpus built and split into sentences, the simplest form of
search is pattern matching. This vignette uses
`search_regex()` across a generational-politics corpus to
show what well-constructed regex actually buys you — age-range
constructions, turnout language, numeric change expressions — and how
the results come back in a KWIC (keyword-in-context) format ready for
close reading or downstream aggregation.
# Search queries, one per generational cohort (Gen Z, Millennial, Gen X,
# Baby Boomer).
search_terms <- c(
  "US Gen Z voters 2026",
  "US Millennial political party 2026",
  "US Gen X politics forgotten generation 2026",
  "US Baby Boomer Republican Democrat 2026"
)
# Fetch page content for every candidate URL.
# `web_urls` is created outside this section; it must carry `path_depth`
# and `url` columns.
web_text_list <- web_urls |>
  # Keep rows with a positive path depth (presumably drops bare-domain
  # landing pages -- TODO confirm against how `path_depth` is computed).
  filter(path_depth > 0) |>
  pull(url) |>
  # NOTE(review): `cores = 6` is hard-coded; presumably the number of
  # parallel workers -- confirm it suits the machine running the vignette.
  textpress::read_urls(cores = 6)
# Take the `text` element of the fetch result and give every distinct URL
# an integer `doc_id`, numbered in order of first appearance, then move
# that id to the first column.
web_text <- web_text_list$text |>
  mutate(doc_id = match(url, unique(url))) |>
  relocate(doc_id, .before = 1)
# Split the page text into sentences, keyed by document and node so each
# sentence can be traced back to where it came from.
web_ss <- web_text |>
  textpress::nlp_split_sentences(by = c("doc_id", "node_id"))
# Named regular expressions used by the searches in this vignette.
# The negative lookaheads `(?!...)` require a Perl-compatible or ICU
# regex engine (e.g. `perl = TRUE` in base R).
patterns <- list(
  # Engagement / disengagement vocabulary, accepting -ize and -ise
  # spellings.
  energized = paste0(
    "\\b(?:energi[zs]ed|motivated|mobili[zs]ed|fired\\s+up|",
    "disillusioned|apathetic)\\b"
  ),

  # Age expressions: three alternatives joined with "|".
  age_range = paste(
    # 1. "aged 18 to 29", "ages 18-29", "aged between 18 and 29",
    #    "aged 65 and older".
    paste0(
      "\\b(?:aged?(?:\\s+between)?|ages?)\\s+\\d{2,3}",
      "(?:(?:\\s*(?:[-–]|to|and)\\s*\\d{2,3})",
      "|(?:\\s+and\\s+(?:older|younger|over|under)))\\b"
    ),
    # 2. "under 30". The lookahead skips longer numbers, thousands
    #    separators ("under 30,000") and percentages ("under 30%").
    #    Only a comma followed by a digit is rejected, so an ordinary
    #    clause comma ("under 30, who ...") still matches.
    "\\bunder\\s+\\d{2}(?!\\d|,\\d|%)\\b",
    # 3. "65+", "65 and older". The trailing word boundary is applied to
    #    the word branch only: `\\b` right after a literal "+" can only
    #    succeed when a word character follows, which made "65+" at the
    #    end of a word or sentence unmatchable.
    paste0(
      "\\b\\d{2,3}(?!\\d|,|%)\\s*",
      "(?:\\+|and\\s+(?:older|younger|over|under)\\b)"
    ),
    sep = "|"
  ),

  # Numeric change expressions such as "from 40 to 52".
  from_to = "\\bfrom\\s+\\d+\\s+to\\s+\\d+\\b"
)

# age_range ----
# Sentences matching the age-range pattern, one row per distinct sentence.
fs <- web_ss |>
  textpress::search_regex(
    query = patterns$age_range,
    by = c("doc_id", "node_id"),
    # Opening / closing HTML placed around each hit (presumably wrapped
    # around the matched span -- confirm against the textpress docs).
    highlight = c('<span style="background:#a6cbe1;">', "</span>")
  ) |>
  # Keep the first row for each unique sentence text.
  distinct(text, .keep_all = TRUE) |>
  select(doc_id, node_id, pattern, text)

# Show the hits as an interactive table only when there is something to
# show. `escape = FALSE` is needed so the highlight <span> renders as HTML.
# NOTE(review): `text` is scraped web content and is rendered unescaped
# here; any markup in the source pages will be interpreted by the browser.
if (!is.null(fs) && nrow(fs) > 0) {
  DT::datatable(fs, rownames = FALSE, escape = FALSE)
}

# from_to ----
# Sentences matching the "from N to M" pattern, one row per distinct
# sentence.
fs1 <- web_ss |>
  textpress::search_regex(
    query = patterns$from_to,
    by = c("doc_id", "node_id"),
    highlight = c('<span style="background:#fdc4a8;">', "</span>")
  ) |>
  # Keep the first row for each unique sentence text.
  distinct(text, .keep_all = TRUE) |>
  select(doc_id, node_id, pattern, text)

# Render the hits as an interactive table when any were found;
# `escape = FALSE` lets the highlight <span> markup render as HTML.
if (!is.null(fs1) && nrow(fs1) > 0) {
  DT::datatable(fs1, rownames = FALSE, escape = FALSE)
}