# Collect every ELAN file of the test corpus and stack the parsed
# annotations into one table.
eafs <- list.files(path = "../testcorpus/", pattern = "eaf$", full.names = TRUE)

# Derive the annotation duration, the name of the matching audio file and a
# transcription with punctuation removed and whitespace runs collapsed, then
# move the most frequently used columns to the front.
corpus <- bind_rows(map(eafs, read_custom_eaf)) %>%
  mutate(
    time_duration = time_end - time_start,
    audio_file = str_replace(filename, "eaf$", "wav"),
    orth_trimmed = str_replace_all(orth, c("[:punct:]" = "", "\\s+" = " "))
  ) %>%
  filter(participant != "NTP-M-1986") %>% # just getting rid of myself
  select(orth_trimmed, time_start, time_end, time_duration, everything())

# Quick look at how the annotation durations are distributed.
plot(density(corpus$time_duration))
library(exifr)
# Read the metadata of every distinct audio file with exifr and show the
# audio-related columns first.
unique(corpus$audio_file) %>%
  map(exifr::read_exif) %>%
  bind_rows() %>%
  rename(audio_file = SourceFile) %>%
  select(BitsPerSample, Duration, FileType, NumChannels, everything())
## # A tibble: 3 x 18
## BitsPerSample Duration FileType NumChannels
## <int> <dbl> <chr> <int>
## 1 16 21.51046 WAV 1
## 2 16 101.82546 WAV 1
## 3 16 90.70592 WAV 1
## # ... with 14 more variables: audio_file <chr>, ExifToolVersion <dbl>,
## # FileName <chr>, Directory <chr>, FileSize <int>, FileModifyDate <chr>,
## # FileAccessDate <chr>, FileInodeChangeDate <chr>,
## # FilePermissions <int>, FileTypeExtension <chr>, MIMEType <chr>,
## # Encoding <int>, SampleRate <int>, AvgBytesPerSec <int>
library(glue)
# Down-mix every distinct recording to one channel with SoX. The result is
# written next to the original as "<name>-mono.wav".
corpus %>%
  distinct(audio_file) %>%
  pull(audio_file) %>%
  walk(function(wav) {
    # The dot is escaped so that only a literal ".wav" extension is replaced.
    mono <- str_replace(wav, "\\.wav$", "-mono.wav")
    # Both paths are shell-quoted so that names containing spaces survive.
    seewave::sox(glue("{shQuote(wav)} -c 1 {shQuote(mono)}"))
  })
sox file.wav -c 1 file-mono.wav
#' Cut one annotated stretch out of a recording with SoX
#'
#' Writes `<out_dir>/<reference_id>.wav`, creating `out_dir` first when it
#' does not exist yet.
#'
#' @param audio_file Path of the source audio file handed to SoX.
#' @param reference_id Identifier used as the clip's file name (no extension).
#' @param start Clip onset; divided by 1000 before it is passed to `trim`
#'   (presumably milliseconds to seconds -- confirm against the ELAN export).
#' @param duration Clip length, scaled the same way as `start`.
#' @param out_dir Directory that receives the clip. The default is the
#'   location that used to be hard-coded.
#' @return Whatever `seewave::sox()` returns.
cut_elan_ref <- function(audio_file, reference_id, start, duration,
                         out_dir = "../testcorpus/reference_clips") {
  if (!dir.exists(out_dir)) {
    # recursive = TRUE also creates missing parent directories.
    dir.create(out_dir, recursive = TRUE)
  }

  clip_file <- file.path(out_dir, paste0(reference_id, ".wav"))

  # Paths are shell-quoted so that names containing spaces reach SoX intact.
  seewave::sox(command = glue(
    "{shQuote(audio_file)} {shQuote(clip_file)} ",
    "trim {start / 1000} {duration / 1000}"
  ))
}
# For every reference id: cut the matching stretch of audio, then store its
# cleaned transcription next to the clip as "<ref>.txt".
corpus %>%
  distinct(audio_file, ref, time_start, time_duration, orth_trimmed) %>%
  split(.$ref) %>%
  walk(function(clip) {
    cut_elan_ref(clip$audio_file, clip$ref, clip$time_start, clip$time_duration)
  }) %>%
  walk(function(clip) {
    # The target is given positionally: readr renamed this argument from
    # `path` to `file`, and the position is valid under either name.
    write_lines(
      clip$orth_trimmed[1],
      glue::glue("../testcorpus/reference_clips/", clip$ref[1], ".txt")
    )
  })
library(emuR)
# Turn the clip/transcription pairs into an emuDB in the working directory,
# storing the text under the attribute "orth", and open the result.
convert_txtCollection(
  dbName = "testcorpus",
  sourceDir = "../testcorpus/reference_clips",
  targetDir = ".",
  txtExtension = ".txt",
  mediaFileExtension = "wav",
  attributeDefinitionName = "orth"
)

# FALSE is spelled out because `F` is an ordinary, reassignable variable.
dbHandle <- load_emuDB("testcorpus_emuDB", verbose = FALSE)
# Step 1: G2P tokenisation service, reading "orth" and writing "ORT".
runBASwebservice_g2pForTokenization(
  handle = dbHandle,
  language = "rus-RU",
  transcriptionAttributeDefinitionName = "orth",
  orthoAttributeDefinitionName = "ORT",
  resume = FALSE,
  verbose = TRUE
)

# Step 2: G2P pronunciation service, reading "ORT" and writing "KAN". The
# local file kpv-sampa.txt is uploaded as the `imap` parameter.
runBASwebservice_g2pForPronunciation(
  handle = dbHandle,
  language = "und",
  orthoAttributeDefinitionName = "ORT",
  canoAttributeDefinitionName = "KAN",
  params = list(
    embed = "maus",
    imap = RCurl::fileUpload("../testcorpus/kpv-sampa.txt")
  ),
  resume = FALSE,
  verbose = TRUE
)

# Step 3: MAUS service, reading "KAN" and writing "MAUS".
runBASwebservice_maus(
  handle = dbHandle,
  language = "rus-RU",
  canoAttributeDefinitionName = "KAN",
  mausAttributeDefinitionName = "MAUS",
  chunkLevel = NULL,
  turnChunkLevelIntoItemLevel = TRUE,
  perspective = "default",
  resume = FALSE,
  verbose = TRUE
)

# Step 4: export the three attribute levels as TextGrids.
export_TextGridCollection(
  dbHandle,
  targetDir = "../testcorpus/praat_freiburg",
  attributeDefinitionNames = c("ORT", "KAN", "MAUS")
)
In case you are curious:
git clone http://github.com/langdoc/praat-stuff
What follows is a somewhat complicated interaction of Praat, shell scripts and R
# Install everything the phoneme viewer depends on in one call.
install.packages(c(
  "shiny",
  "shinydashboard",
  "tidyverse",
  "ggplot2",
  "tuneR",
  "seewave",
  "forcats"
))

# Namespace-qualified so the call works without shiny being attached.
shiny::runGitHub("phoneme-viewer", username = "langdoc")
meow::meow
# Listing of the function printed by `meow::meow`. It takes no arguments,
# downloads a JPEG to a temporary file, draws it on the current graphics
# device and invisibly returns a single logical status.
function ()
{
# Image endpoint; paste0() receives a single string here.
url <- paste0("http://thecatapi.com/api/images/get?format=src&type=jpg&size=med")
tmp <- tempfile()
# Save the response to the temporary file in binary mode.
dl_status <- download.file(url, tmp, quiet = TRUE, mode = "wb")
pic <- jpeg::readJPEG(tmp)
# Empty unit-square canvas without box, axes or axis labels.
plot(1, type = "n", xlim = c(0, 1), ylim = c(0, 1), bty = "n",
xaxt = "n", yaxt = "n", xlab = "", ylab = "")
# Stretch the picture over the whole canvas.
graphics::rasterImage(pic, 0, 0, 1, 1)
rm_status <- file.remove(tmp)
# TRUE only when dl_status coerces to FALSE and the temp file was removed.
status <- all(!as.logical(dl_status), rm_status)
return(invisible(status))
}
<bytecode: 0x1272f51b8>
<environment: namespace:meow>