With a corpus built and split into sentences, the simplest form of search is pattern matching. This vignette runs search_regex() across a generational-politics corpus to show what well-constructed regular expressions actually buy you — age-range constructions, turnout language, numeric change expressions — and how the results come back in a keyword-in-context (KWIC) format, ready for close reading or downstream aggregation.

Search terms and web URLs

# Web search queries, one per generational cohort. Each element is passed
# as the `query` argument of textpress::fetch_urls() further down.
search_terms <- c(
  "US Gen Z voters 2026",
  "US Millennial political party 2026",
  "US Gen X politics forgotten generation 2026",
  "US Baby Boomer Republican Democrat 2026"
)
library(textpress)
library(dplyr)
library(DT)

# Run one URL search per query, stack the per-query results into a single
# table, and drop rows that are exact duplicates across queries.
# NOTE(review): `date_filter = "m"` presumably restricts hits to the past
# month -- confirm against the textpress::fetch_urls() documentation.
web_urls <- unique(
  bind_rows(
    lapply(search_terms, function(term) {
      textpress::fetch_urls(query = term, n_pages = 3, date_filter = "m")
    })
  )
)

Web text and sentence split

# Scrape the page text for every retained URL.
# NOTE(review): `path_depth > 0` presumably drops bare-domain / home-page
# hits so only article-level URLs are read -- confirm.
web_text_list <- textpress::read_urls(
  web_urls |> filter(path_depth > 0) |> pull(url),
  cores = 6
)

# Give each distinct URL an integer doc_id (numbered by first appearance)
# and move that id to the first column.
web_text <- relocate(
  mutate(web_text_list$text, doc_id = match(url, unique(url))),
  doc_id,
  .before = 1
)

# Split the text into sentences, keyed by doc_id and node_id.
web_ss <- textpress::nlp_split_sentences(
  web_text,
  by = c("doc_id", "node_id")
)

Regex patterns

# Named regular expressions used as `query` values for
# textpress::search_regex(). Several of them rely on negative lookahead
# ((?!...)), so the matching engine must support that syntax.
patterns <- list(
  # Engagement / disengagement vocabulary, accepting both -ize and -ise
  # spellings.
  energized = "\\b(?:energi[zs]ed|motivated|mobili[zs]ed|fired\\s+up|disillusioned|apathetic)\\b",

  # Age expressions: three alternatives joined with "|".
  age_range = paste(
    # 1. "aged 18-29", "ages 18 to 29", "aged between 18 and 29",
    #    "ages 65 and older".
    "\\b(?:aged?(?:\\s+between)?|ages?)\\s+\\d{2,3}(?:(?:\\s*(?:[-–]|to|and)\\s*\\d{2,3})|(?:\\s+and\\s+(?:older|younger|over|under)))\\b",
    # 2. "under 30": a two-digit number that is not followed by another
    #    digit, a comma, or a percent sign.
    "\\bunder\\s+\\d{2}(?!\\d|,|%)\\b",
    # 3. "65+" or "65 and older". A "\\b" placed directly after "\\+" only
    #    succeeds when a word character follows the plus sign, which
    #    rejects "65+ voters" / "65+." yet accepts arithmetic like "10+5".
    #    The plus branch therefore uses a "not followed by a number"
    #    lookahead, and the word boundary is kept on the word branch only.
    "\\b\\d{2,3}(?!\\d|,|%)\\s*(?:\\+(?!\\s*\\d)|and\\s+(?:older|younger|over|under)\\b)",
    sep = "|"
  ),

  # Integer-only "from N to M". Values carrying "%", thousands separators,
  # or decimals are not matched by this pattern.
  from_to = "\\bfrom\\s+\\d+\\s+to\\s+\\d+\\b"
)

Search results

Search ~ age_range

# Find sentences containing an age expression, wrap each hit in a blue
# highlight <span>, keep the first row per distinct sentence text, and
# retain only the display columns.
fs <- textpress::search_regex(
  web_ss,
  query = patterns$age_range,
  by = c("doc_id", "node_id"),
  highlight = c('<span style="background:#a6cbe1;">', "</span>")
) |>
  distinct(text, .keep_all = TRUE) |>
  select(doc_id, node_id, pattern, text)

# Render only when there is at least one hit. `escape = FALSE` lets the
# highlight <span> render as HTML; it also means any markup present in the
# scraped text is rendered unescaped.
# NOTE(review): confirm the sources are trusted or sanitize upstream.
if (!is.null(fs) && nrow(fs) > 0) {
  DT::datatable(fs, rownames = FALSE, escape = FALSE)
}

Search ~ from_to

# Find sentences containing a "from N to M" expression, wrap each hit in
# an orange highlight <span>, keep the first row per distinct sentence
# text, and retain only the display columns.
fs1 <- textpress::search_regex(
  web_ss,
  query = patterns$from_to,
  by = c("doc_id", "node_id"),
  highlight = c('<span style="background:#fdc4a8;">', "</span>")
) |>
  distinct(text, .keep_all = TRUE) |>
  select(doc_id, node_id, pattern, text)

# Render only when there is at least one hit. `escape = FALSE` lets the
# highlight <span> render as HTML; it also means any markup present in the
# scraped text is rendered unescaped.
# NOTE(review): confirm the sources are trusted or sanitize upstream.
if (!is.null(fs1) && nrow(fs1) > 0) {
  DT::datatable(fs1, rownames = FALSE, escape = FALSE)
}