Data checks

Author

Stefano Coretta

Beta not Gaussian

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(coretta2018itaegg)
library(Hmisc)

Attaching package: 'Hmisc'

The following objects are masked from 'package:dplyr':

    src, summarize

The following objects are masked from 'package:base':

    format.pval, units
library(brms)
Loading required package: Rcpp
Loading 'brms' package (version 2.22.0). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

Attaching package: 'brms'

The following object is masked from 'package:stats':

    ar
# Simulate 1000 draws from a Beta(4, 0.5) distribution. The values are bounded
# between 0 and 1; the seed fixes the simulated sample.
set.seed(8923)
x <- rbeta(n = 1000, shape1 = 4, shape2 = 0.5)

# Kernel density estimate of the simulated draws.
x |>
  density() |>
  plot()

# Intercept-only model with no `family` given, so brm() falls back to its
# default family. The fitted model is cached on disk under the `file` path.
bg <- brm(
  formula = x ~ 1,
  data = data.frame(x = x),
  file = "data/cache/bg"
)

# Posterior predictive check using 10 posterior draws.
bg |>
  pp_check(ndraws = 10)

# Same intercept-only model, this time with the Beta family.
bb <- brm(
  formula = x ~ 1,
  data = data.frame(x = x),
  family = Beta(),
  file = "data/cache/bb"
)

# Posterior predictive check using 10 posterior draws.
bb |>
  pp_check(ndraws = 10)

Voicing within closure

# Load the `ita_egg` data set into the workspace.
data("ita_egg")

# Density of `voi_clo_prop` over all rows of the data set.
ggplot(ita_egg, aes(x = voi_clo_prop)) +
  geom_density()
Warning: Removed 805 rows containing non-finite outside the scale range
(`stat_density()`).

# Keep only rows whose `voi_clo_prop` lies strictly between 0 and 1. The same
# subset feeds every plot in this section.
ita_egg_filt <- ita_egg |>
  filter(voi_clo_prop > 0 & voi_clo_prop < 1)

# Density of the retained values, with a rug marking individual observations.
ggplot(ita_egg_filt, aes(x = voi_clo_prop)) +
  geom_density() +
  geom_rug()

# Shared base plot: vowel on the x-axis, `voi_clo_prop` on the y-axis.
vowel_base <- ggplot(ita_egg_filt, aes(x = vowel, y = voi_clo_prop))

# Jittered raw observations with the bootstrapped mean and confidence limits
# per vowel on top.
vowel_base +
  geom_jitter(width = 0.2, alpha = 0.1) +
  stat_summary(fun.data = "mean_cl_boot")

# Same display, with points coloured by vowel and one panel column per `c2`.
vowel_base +
  geom_jitter(aes(colour = vowel), width = 0.2, alpha = 0.2) +
  stat_summary(fun.data = "mean_cl_boot") +
  facet_grid(cols = vars(c2))

# Summaries only (no raw points), one panel column per `c2`.
vowel_base +
  stat_summary(fun.data = "mean_cl_boot") +
  facet_grid(cols = vars(c2))

# `voi_clo_prop` against `speech_rate` with a linear fit, one panel per vowel.
ggplot(ita_egg_filt, aes(x = speech_rate, y = voi_clo_prop)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(vowel))
`geom_smooth()` using formula = 'y ~ x'

Vowel nasalisation

# Raw velum data, read straight from the `rtMRI-velum` folder of the
# ChristopherCarignan/journal-articles GitHub repository.
velum_url <- "https://raw.githubusercontent.com/ChristopherCarignan/journal-articles/refs/heads/master/rtMRI-velum/velum_data.csv"

# Coda labels (values of `post`) assigned to each voicing category.
voiceless_codas <- c("nt__", "nt_@", "nt_6", "nt_a")
voiced_codas <- c("nd_@", "nd_6", "nd_a")

nasal <- read_csv(velum_url) |>
  # keep rows with stress "N" and one of the five listed vowel labels
  filter(stress == "N" & vowel %in% c("a_", "E_", "I_", "O_", "U_")) |>
  mutate(
    # codas in neither set get NA here and are dropped in the next step
    voicing = case_when(
      post %in% voiceless_codas ~ "voiceless",
      post %in% voiced_codas ~ "voiced"
    ),
    # lower-case the vowel label and remove its underscore
    vowel = str_remove(str_to_lower(vowel), "_")
  ) |>
  # drop codas not included in the analysis
  drop_na(voicing) |>
  mutate(
    # interval from `velumopening_maxvel_on` to `Vokal_off`, scaled by 1000
    # (presumably seconds to milliseconds -- TODO confirm units)
    nas_dur = 1000 * (Vokal_off - velumopening_maxvel_on),
    # that interval as a proportion of the vowel duration
    nas_prop = nas_dur / (1000 * Vokal_dur),
    NC = if_else(voicing == "voiceless", "nt", "nd")
  ) |>
  # keep only proportions strictly between 0 and 1; values outside that range
  # are presumably tracking errors in the MRI-derived measures -- TODO confirm
  filter(nas_prop > 0 & nas_prop < 1) |>
  select(speaker, label, vowel, NC, voicing, nas_prop)
Rows: 7151 Columns: 42
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (8): speaker, label, prev, vowel, post, stress, nasality, word
dbl (34): velum2US_Vokal_min, alvUS_Vokal_min, alv2US_Vokal_min, alvUS_Coda_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Save the processed table. The target folder is created first when it is
# missing (e.g. on a fresh checkout), because write_csv() does not create
# directories and would otherwise fail.
nasal_path <- "data/carignan2021/nasal.csv"
dir.create(dirname(nasal_path), recursive = TRUE, showWarnings = FALSE)
write_csv(nasal, nasal_path)

# Density of `nas_prop`, one semi-transparent curve per voicing category.
nas_density <- ggplot(nasal, aes(x = nas_prop, fill = voicing)) +
  geom_density(alpha = 0.5)
nas_density

# Same densities, split into one panel per vowel.
nas_density +
  facet_wrap(~vowel)