Read HTML headers and text from file
Value
A data frame with a column called `text` and header columns called `title`, `part`, `section`, and `subsection` as needed.
Header columns are limited to page elements tagged as h1, h2, h3, or h4.
Examples
if (FALSE) {
  # Example only: guarded by `if (FALSE)` so it is never evaluated
  # automatically (the pipeline starts from a remote URL).
  library(dplyr)
  library(stringr)
  library(tmtyro)

  orlando <-
    "http://gutenberg.net.au/ebooks02/0200331h.html" |>
    # presumably fetches the page a single time and reuses the local copy;
    # confirm against the download_once() documentation
    download_once() |>
    # yields a data frame with `text` plus header columns such as
    # `title` and `part`
    parse_html() |>
    # keep only rows whose `part` header mentions a chapter
    filter(str_detect(part, "CHAPTER")) |>
    mutate(
      # "\\d+" captures the whole chapter number; a bare "\\d" would
      # truncate "CHAPTER 12" to "1" and merge it with chapter 1
      chapter = str_extract(part, "\\d+"),
      author = "Virginia Woolf"
    ) |>
    select(author, title, chapter, text) |>
    # drop rows where no chapter number was found; uses dplyr so the
    # example does not rely on tidyr being attached
    filter(!is.na(chapter)) |>
    # NOTE(review): identify_by() and load_texts() come from tmtyro;
    # their exact effect is not visible here
    identify_by(title, chapter) |>
    load_texts()
}