library(tidyverse)
theme_set(theme_light())
library(mgcv)
library(tidygam)
The decline of local anchoring: A quantitative investigation
Packages
Read data
eng_hist <- read_csv("data/eng_hist.csv")
New names:
Rows: 21558 Columns: 34
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(25): TextId, SubType, Title, Author, Genre, Cat, Locs, Locw, ft_clsMain... dbl
(7): ResId, Date, Size, Words, ft_antdist, Include, TextList lgl (2): ...32,
...33
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...32`
• `` -> `...33`
• `` -> `...34`
# Keep only the rows flagged for inclusion (Include == 1) and derive the
# variables used by the plots and models below:
#   period - numeric rank of the English period (OE = 1, ME = 2, eModE = 3,
#            lModE = 4)
#   cent   - century, computed from the numeric `Date` column
#   genre  - first three characters of the lower-cased `Genre` label
eng_filt <- eng_hist %>%
  filter(Include == 1) %>%
  mutate(
    EnglishPeriod = factor(
      EnglishPeriod,
      levels = c("OE", "ME", "eModE", "lModE")
    ),
    # The factor codes follow the level order above, so they give the 1-4
    # ranking directly; any other period label becomes NA.
    period = as.numeric(EnglishPeriod),
    Pentaset = factor(
      Pentaset,
      levels = c("Identity", "Inferred", "Assumed", "Inert", "New")
    ),
    TextId = as.factor(TextId),
    # Integer division rather than slicing the printed number: 950 -> 10,
    # 1150 -> 12, and it stays correct for values below 100.
    # NOTE(review): assumes Date holds a calendar year - confirm.
    cent = Date %/% 100 + 1,
    genre = str_sub(str_to_lower(Genre), 1, 3)
  )
# Count tables used for plotting and modelling. Each has one row per
# combination of the listed variables, with `n` the number of rows of
# `eng_filt` in that combination. `count()` with explicit columns returns an
# ungrouped tibble, so no grouping leaks into later steps.

# Counts per text and Pentaset category, with the English period.
eng_count <- eng_filt %>%
  count(EnglishPeriod, period, TextId, Pentaset, Words)

# Counts per text and Pentaset category, with the century.
eng_count_date <- eng_filt %>%
  count(cent, TextId, Pentaset, Words)

# Counts per text and Pentaset category, with period and genre.
eng_count_genre <- eng_filt %>%
  count(EnglishPeriod, period, TextId, Pentaset, Words, genre)
# Prepositions kept for the by-preposition plots (normalised labels).
preps <- c(
  "after", "at", "about", "among", "before", "between", "binnan", "by",
  "for", "from", "mid", "of", "on", "through", "till", "to", "upon",
  "without"
)

# Collapse variant forms onto one label per preposition, leave every other
# form untouched, then keep only the rows whose label is in `preps`.
eng_preps <- eng_filt %>%
  mutate(
    ft_p_text = case_when(
      ft_p_text %in% c("ymbe", "ymben", "vmben") ~ "about",
      ft_p_text == "ere" ~ "before",
      ft_p_text %in% c("in", "within") ~ "binnan",
      ft_p_text == "with" ~ "mid",
      ft_p_text == "until" ~ "till",
      ft_p_text == "unto" ~ "to",
      TRUE ~ ft_p_text
    )
  ) %>%
  filter(ft_p_text %in% preps)
Data checks
# Missing-value check on the raw data: tabulate whether TextId is NA.
table(is.na(eng_hist$TextId))
FALSE
21558
# Same check for Pentaset; the output below reports one missing value.
table(is.na(eng_hist$Pentaset))
FALSE TRUE
21557 1
# Repeat both checks on the filtered data (rows with Include == 1).
table(is.na(eng_filt$TextId))
FALSE
18192
table(is.na(eng_filt$Pentaset))
FALSE
18192
# Number of rows of the filtered data in each English period.
eng_filt %>%
count(EnglishPeriod)# A tibble: 4 × 2
EnglishPeriod n
<fct> <int>
1 OE 6185
2 ME 4068
3 eModE 4942
4 lModE 2997
# Number of rows of the filtered data in each Pentaset category.
eng_filt %>%
count(Pentaset)# A tibble: 5 × 2
Pentaset n
<fct> <int>
1 Identity 5532
2 Inferred 4452
3 Assumed 475
4 Inert 2515
5 New 5218
# Cross-tabulation: rows per English period and Pentaset category.
eng_filt %>%
count(EnglishPeriod, Pentaset)# A tibble: 20 × 3
EnglishPeriod Pentaset n
<fct> <fct> <int>
1 OE Identity 2367
2 OE Inferred 1139
3 OE Assumed 157
4 OE Inert 967
5 OE New 1555
6 ME Identity 1419
7 ME Inferred 1015
8 ME Assumed 72
9 ME Inert 351
10 ME New 1211
11 eModE Identity 1208
12 eModE Inferred 1174
13 eModE Assumed 118
14 eModE Inert 421
15 eModE New 2021
16 lModE Identity 538
17 lModE Inferred 1124
18 lModE Assumed 128
19 lModE Inert 776
20 lModE New 431
# Row counts per English period.
ggplot(eng_filt, aes(x = EnglishPeriod)) +
  geom_bar()

# Row counts per Pentaset category.
ggplot(eng_filt, aes(x = Pentaset)) +
  geom_bar()

# Pentaset composition of each period: stacked counts.
ggplot(eng_filt, aes(x = EnglishPeriod, fill = Pentaset)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1")

# Same data as within-period proportions.
ggplot(eng_filt, aes(x = EnglishPeriod, fill = Pentaset)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set1")

# Same data with the categories side by side.
ggplot(eng_filt, aes(x = EnglishPeriod, fill = Pentaset)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1")
# Per-text counts against period rank, one panel per Pentaset category.
ggplot(eng_count, aes(x = period, y = n, colour = Pentaset)) +
  geom_jitter(alpha = 0.2) +
  facet_wrap(vars(Pentaset))

# Same plot with the count divided by the text's word count.
ggplot(eng_count, aes(x = period, y = n / Words, colour = Pentaset)) +
  geom_jitter(alpha = 0.2) +
  facet_wrap(vars(Pentaset))
# Relative frequency (count per word) of each Pentaset category, by genre
# and by object-NP type.
#
# `Words` is carried on every row of a text (it is grouped together with
# TextId in the count tables and used as the per-text offset in the models),
# so summing it over rows would count each text's length once per row.
# The denominators below therefore add each text's length only once.
# NOTE(review): assumes Words is constant within a TextId - confirm.

# Total words per genre, each text counted once.
genre_words <- eng_filt %>%
  distinct(genre, TextId, Words) %>%
  group_by(genre) %>%
  summarise(words = sum(Words), .groups = "drop")

eng_filt %>%
  count(genre, Pentaset) %>%
  left_join(genre_words, by = "genre") %>%
  mutate(n_rel = n / words) %>%
  ggplot(aes(Pentaset, n_rel, fill = Pentaset)) +
  geom_col() +
  facet_wrap(~ genre)

# Object-NP type is a property of the individual row rather than of the
# text, so the exposure is the size of the whole filtered corpus.
corpus_words <- eng_filt %>%
  distinct(TextId, Words) %>%
  pull(Words) %>%
  sum()

eng_filt %>%
  count(ft_objNP_type, Pentaset) %>%
  mutate(
    words = corpus_words,
    n_rel = n / words
  ) %>%
  ggplot(aes(Pentaset, n_rel, fill = Pentaset)) +
  geom_col() +
  facet_wrap(~ ft_objNP_type)
# Row counts per period for every preposition (columns) and Pentaset
# category (rows).
ggplot(eng_preps, aes(x = EnglishPeriod, fill = EnglishPeriod)) +
  geom_bar() +
  facet_grid(rows = vars(Pentaset), cols = vars(ft_p_text))

# Identity rows only, for a subset of prepositions; "binnan" and "mid" are
# shown under the labels "in" and "with". The figure is written to disk.
eng_preps %>%
  filter(
    Pentaset == "Identity",
    ft_p_text %in% c(
      "after", "at", "between", "binnan", "by", "for", "from", "mid", "of",
      "on", "through", "to"
    )
  ) %>%
  mutate(
    preposition = if_else(
      ft_p_text == "binnan",
      "in",
      if_else(ft_p_text == "mid", "with", ft_p_text)
    )
  ) %>%
  ggplot(aes(x = EnglishPeriod, fill = EnglishPeriod)) +
  geom_bar() +
  facet_wrap(vars(preposition)) +
  xlab("English period") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, vjust = 0.5)
  )

# No plot argument: ggsave() writes the most recently displayed plot.
ggsave("img/prep-period.png", width = 7, height = 5)
GAMs
Period and Pentaset
# Poisson GAM of per-text counts: a parametric Pentaset effect, one smooth
# over period rank per Pentaset category, a random-effect term for text, and
# an offset for text length. The fitted model is cached on disk and reloaded
# on later runs.
gam_1_path <- "./data/cache/gam_1.rds"

if (file.exists(gam_1_path)) {
  gam_1 <- readRDS(gam_1_path)
} else {
  gam_1 <- bam(
    n ~
      Pentaset +
      s(period, by = Pentaset, k = 3) +
      s(TextId, bs = "re", m = 1) +
      # The offset is zero when Words == 1e5, so predictions made at that
      # value are counts per 100k words.
      offset(log(Words / 100000)),
    data = eng_count,
    family = poisson,
    discrete = TRUE
  )
  # Create the cache directory on a fresh checkout so saveRDS() cannot fail.
  dir.create(dirname(gam_1_path), recursive = TRUE, showWarnings = FALSE)
  saveRDS(gam_1, gam_1_path)
}

# Predicted counts per 100k words along period rank, excluding the by-text
# term; exp() maps the predictions back from the log link.
predict_gam(
  gam_1,
  series = "period",
  values = c("Words" = 1e+5),
  exclude_terms = "s(TextId)",
  length_out = 50,
  tran_fun = exp
) %>%
  plot(comparison = "Pentaset") +
  geom_vline(xintercept = c(1:4), linetype = "dotted") +
  scale_color_brewer(type = "qual", palette = "Set1") +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  # Explicit breaks keep the four labels aligned with ranks 1-4 instead of
  # relying on the automatically chosen breaks having length four.
  scale_x_continuous(
    breaks = 1:4,
    labels = c("OE", "ME", "eModE", "lModE"),
    minor_breaks = NULL
  ) +
  labs(
    y = "Number of PP per 100k words"
  )
ggsave("img/period-penta.png", width = 7, height = 5)
We fitted a generalised additive mixed-effects model to the number of prepositional phrases, using a Poisson distribution. We included the following terms (in parentheses, an explanation of how the term contributes to the model): Pentaset (Identity, Inferred, Assumed, Inert, New), as a parametric term (average number of prepositional phrases according to Pentaset), a smooth over Period (OE, ME, eModE, lModE) by Pentaset (change in number of prepositional phrases over Period by Pentaset), and a by-text random intercept (to account for variation between texts). An offset term was also included to account for the fact that length (in words) differed across texts. The reported estimates are the number of prepositional phrases assuming a text length of 100k words.
The number of Identity PPs decreases from Old English to Late Modern English, while Inferred PPs show a less pronounced increase, especially after Middle English. The number of Inert PPs seems to decrease from Old to Middle English, while it increases again by Late Modern English. The opposite pattern can be observed in the New PPs: after increasing from Old to Middle English, they decrease again by Late Modern English. Assumed PPs are overall less frequent and don’t show clear patterns of change through time, perhaps with the exception of a small increase between Early and Late Modern English.
Century and Pentaset
# Poisson GAM of per-text counts with century in place of period rank: a
# parametric Pentaset effect, one smooth over century per Pentaset category,
# a random-effect term for text, and an offset for text length. The fitted
# model is cached on disk and reloaded on later runs.
gam_2_path <- "./data/cache/gam_2.rds"

if (file.exists(gam_2_path)) {
  gam_2 <- readRDS(gam_2_path)
} else {
  gam_2 <- bam(
    n ~
      Pentaset +
      s(cent, by = Pentaset, k = 4) +
      s(TextId, bs = "re", m = 1) +
      # The offset is zero when Words == 1e5, so predictions made at that
      # value are counts per 100k words.
      offset(log(Words / 100000)),
    data = eng_count_date,
    family = poisson,
    discrete = TRUE
  )
  # Create the cache directory on a fresh checkout so saveRDS() cannot fail.
  dir.create(dirname(gam_2_path), recursive = TRUE, showWarnings = FALSE)
  saveRDS(gam_2, gam_2_path)
}

# Predicted counts per 100k words along century, excluding the by-text term;
# exp() maps the predictions back from the log link.
predict_gam(
  gam_2,
  series = "cent",
  values = c("Words" = 1e+5),
  exclude_terms = "s(TextId)",
  length_out = 50,
  tran_fun = exp
) %>%
  plot(comparison = "Pentaset") +
  scale_color_brewer(type = "qual", palette = "Set1") +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  labs(
    y = "Number of PP per 100k words"
  )
ggsave("img/cent-penta.png", width = 7, height = 5)
We fitted a generalised additive mixed-effects model to the number of prepositional phrases, using a Poisson distribution. We included the following terms (in parentheses, an explanation of how the term contributes to the model): Pentaset (Identity, Inferred, Assumed, Inert, New), as a parametric term (average number of prepositional phrases according to Pentaset), a smooth over Century by Pentaset (change in number of prepositional phrases over Century by Pentaset), and a by-text random intercept (to account for variation between texts). An offset term was also included to account for the fact that length (in words) differed across texts. The reported estimates are the number of prepositional phrases assuming a text length of 100k words.
The results are virtually identical to the model fitted with English Period. The number of Identity PPs decreases from Old English to Late Modern English, while Inferred PPs show a less pronounced increase, especially after Middle English. The number of Inert PPs seems to decrease from Old to Middle English, while it increases again by Late Modern English. The opposite pattern can be observed in the New PPs: after increasing from Old to Middle English, they decrease again by Late Modern English. Assumed PPs are overall less frequent and don’t show clear patterns of change through time, perhaps with the exception of a small increase between Early and Late Modern English.