git clone https://github.com/langdoc/testcorpus
- reference tier
\- transcription tier
\- token tier
\- lemma
\- pos
\- translation tier
library(devtools)
install_github('langdoc/FRelan')
read_tier() and read_cmdi() functions
library(tidyverse)
library(xml2)
# Read every ELAN file found in ../testcorpus, stack the results into a
# single tibble and move the most frequently inspected columns to the front.
# The pattern is anchored on the literal '.eaf' extension, so a file whose
# name merely ends in the letters 'eaf' is not picked up by accident.
corpus <- dir('../testcorpus', pattern = '\\.eaf$', full.names = TRUE) %>%
  map(FRelan::read_eaf) %>%
  bind_rows() %>%
  select(token, participant, session_name, time_start, time_end, everything())
corpus
## # A tibble: 595 x 11
## token participant session_name time_start time_end
## <chr> <chr> <chr> <dbl> <dbl>
## 1 ме MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 2 , MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 3 кӧнечнэ MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 4 же MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 5 , MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 6 вӧлі MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 7 кык MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 8 лун MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 9 в MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 10 шоке MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## # ... with 585 more rows, and 6 more variables: utterance <chr>,
## # reference <chr>, filename <chr>, word <chr>, after <chr>, before <chr>
# Keep only the rows whose token is exactly 'вӧлі'.
filter(corpus, token == 'вӧлі')
## # A tibble: 15 x 11
## token participant session_name time_start time_end
## <chr> <chr> <chr> <dbl> <dbl>
## 1 вӧлі MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 2 вӧлі MVF-F-1984 kpv_izva20140330-1-fragment 9313 14870
## 3 вӧлі JAI-M-1939 kpv_izva20140404IgusevJA-fragment 0 6196
## 4 вӧлі JAI-M-1939 kpv_izva20140404IgusevJA-fragment 6675 9865
## 5 вӧлі JAI-M-1939 kpv_izva20140404IgusevJA-fragment 23375 27600
## 6 вӧлі JAI-M-1939 kpv_izva20140404IgusevJA-fragment 27848 32261
## 7 вӧлі JAI-M-1939 kpv_izva20140404IgusevJA-fragment 80405 83726
## 8 вӧлі JAI-M-1939 kpv_izva20140404IgusevJA-fragment 83726 85805
## 9 вӧлі NTP-M-1986 kpv_udo20120330SazinaJS-encounter 9080 9970
## 10 вӧлі JSS-F-1988 kpv_udo20120330SazinaJS-encounter 13510 17400
## 11 вӧлі JSS-F-1988 kpv_udo20120330SazinaJS-encounter 17405 19515
## 12 вӧлі JSS-F-1988 kpv_udo20120330SazinaJS-encounter 17405 19515
## 13 вӧлі JSS-F-1988 kpv_udo20120330SazinaJS-encounter 34016 34993
## 14 вӧлі JSS-F-1988 kpv_udo20120330SazinaJS-encounter 35781 38639
## 15 вӧлі JSS-F-1988 kpv_udo20120330SazinaJS-encounter 81389 82220
## # ... with 6 more variables: utterance <chr>, reference <chr>,
## # filename <chr>, word <chr>, after <chr>, before <chr>
# Rows where the token is 'вӧлі' and the row directly before it holds the
# token 'татшӧм'; token and utterance are shown first.
corpus %>%
  filter(token == 'вӧлі', lag(token) == 'татшӧм') %>%
  select(token, utterance, everything())
## # A tibble: 1 x 11
## token utterance participant session_name
## <chr> <chr> <chr> <chr>
## 1 вӧлі Татшӧм вӧлі. JSS-F-1988 kpv_udo20120330SazinaJS-encounter
## # ... with 7 more variables: time_start <dbl>, time_end <dbl>,
## # reference <chr>, filename <chr>, word <chr>, after <chr>, before <chr>
# Part-of-speech bigram query: a verb ('V') directly preceded by a pronoun
# ('Pron'). Both conditions test the `pos` column; the tag 'V' is a
# part-of-speech label, not a token. Requires a corpus with a `pos` column.
corpus %>%
  filter(lag(pos) == 'Pron' & pos == 'V')
# Tokens that contain neither a punctuation character nor a Cyrillic letter.
filter(corpus, !str_detect(token, '[[:punct:]\\p{Cyrillic}]'))
## # A tibble: 1 x 11
## token participant session_name time_start time_end
## <chr> <chr> <chr> <dbl> <dbl>
## 1 a NTP-M-1986 kpv_udo20120330SazinaJS-encounter 85947 86212
## # ... with 6 more variables: utterance <chr>, reference <chr>,
## # filename <chr>, word <chr>, after <chr>, before <chr>
# Same selection as above, handed to FRelan::open_eaf() together with the
# index 1.
# NOTE(review): presumably this opens the ELAN file of the first matching
# row - confirm against the FRelan documentation.
FRelan::open_eaf(
  filter(corpus, !str_detect(token, '[[:punct:]\\p{Cyrillic}]')),
  1
)
[ ]
[[:punct:]]
\\p{Cyrillic}
library(glue)
# Read one CMDI metadata file and return a tibble with one row per
# cmd:Actor node found in it.
#
# cmdi_file: the file to read; passed unchanged to read_xml().
#
# Fields addressed with './' are read from children of the Actor node;
# fields addressed with '../../' are read two levels above the Actor node.
# Every value is returned as text, exactly as produced by xml_text().
read_cmdi <- function(cmdi_file) {
  # Text of the first node matching `xpath`, evaluated relative to `node`.
  text_at <- function(node, xpath) {
    xml_text(xml_find_first(node, xpath))
  }

  actors <- xml_find_all(read_xml(cmdi_file), '//cmd:Actor')

  # One single-row tibble per Actor node.
  actor_rows <- map(actors, function(actor) {
    tibble(
      participant = text_at(actor, './cmd:Code'),
      session_name = text_at(actor, '../../cmd:Name'),
      year_birth = text_at(actor, './cmd:BirthDate'),
      year_rec = text_at(actor, '../../cmd:Date'),
      role = text_at(actor, './cmd:Role'),
      sex = text_at(actor, './cmd:Sex'),
      session_address = text_at(actor, '../../cmd:Location/cmd:Address'),
      session_country = text_at(actor, '../../cmd:Location/cmd:Country'),
      # Built from the two columns defined just above.
      session_location = paste0(session_address, ', ', session_country),
      education = text_at(actor, './cmd:Education'),
      name_full = text_at(actor, './cmd:FullName')
    )
  })

  # Stack the per-actor rows into one tibble.
  bind_rows(actor_rows)
}
At this point we can apply the function we just wrote to all the CMDI files we have.
# Apply read_cmdi() to every CMDI file in ../testcorpus and stack the
# per-file results into one tibble.
# The pattern is anchored on the literal '.cmdi' extension, so a file whose
# name merely ends in the letters 'cmdi' is not picked up by accident.
metadata <- dir('../testcorpus', pattern = '\\.cmdi$', full.names = TRUE) %>%
  map(read_cmdi) %>%
  bind_rows()
metadata
## # A tibble: 9 x 11
## participant session_name year_birth year_rec
## <chr> <chr> <chr> <chr>
## 1 MVF-F-1984 kpv_izva20140330-1-fragment 1984 2014-03-30
## 2 VCP-M-1993 kpv_izva20140330-1-fragment 1993 2014-03-30
## 3 NTP-M-1986 kpv_izva20140330-1-fragment 1986 2014-03-30
## 4 MR-M-1971 kpv_izva20140330-1-fragment 1971 2014-03-30
## 5 RB-M-1971 kpv_izva20140330-1-fragment 1971 2014-03-30
## 6 JAI-M-1939 kpv_izva20140404IgusevJA-fragment 1939 2014-04-04
## 7 NTP-M-1986 kpv_izva20140404IgusevJA-fragment 1986 2014-04-04
## 8 NTP-M-1986 kpv_udo20120330SazinaJS-encounter 1986 2012-03-30
## 9 JSS-F-1988 kpv_udo20120330SazinaJS-encounter 1988 2012-03-30
## # ... with 7 more variables: role <chr>, sex <chr>, session_address <chr>,
## # session_country <chr>, session_location <chr>, education <chr>,
## # name_full <chr>
# Attach the metadata to every token row.
# The join keys are spelled out rather than inferred from shared column
# names: `participant` and `session_name` are the two columns the tables
# have in common, and naming them keeps the join stable if either table
# later gains another identically named column.
corpus_full <- left_join(
  corpus,
  metadata,
  by = c('participant', 'session_name')
)
corpus_full
## # A tibble: 595 x 20
## token participant session_name time_start time_end
## <chr> <chr> <chr> <dbl> <dbl>
## 1 ме MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 2 , MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 3 кӧнечнэ MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 4 же MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 5 , MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 6 вӧлі MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 7 кык MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 8 лун MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 9 в MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## 10 шоке MVF-F-1984 kpv_izva20140330-1-fragment 0 6086
## # ... with 585 more rows, and 15 more variables: utterance <chr>,
## # reference <chr>, filename <chr>, word <chr>, after <chr>,
## # before <chr>, year_birth <chr>, year_rec <chr>, role <chr>, sex <chr>,
## # session_address <chr>, session_country <chr>, session_location <chr>,
## # education <chr>, name_full <chr>
# coordinates <- corpus_full %>%
# distinct(session_location) %>%
# as.data.frame() %>%
# ggmap::mutate_geocode(session_location) %>%
# as_tibble()
# write_csv(coordinates, 'coordinates.csv')
# Load the cached geocoding result (one character column followed by two
# numeric columns) and attach it to the corpus.
# The join key is named explicitly: the coordinates were looked up per
# `session_location`, so that is the column to match on.
coordinates <- read_csv('coordinates.csv', col_types = 'cdd')
corpus_geo <- corpus_full %>%
  left_join(coordinates, by = 'session_location') %>%
  rename(
    lon_session = lon,
    lat_session = lat
  )
library(leaflet)
library(htmlwidgets)
library(widgetframe)
# Map of the recording sessions: one circle per session, with a radius equal
# to the natural log of the session's token count.
# add_count() attaches the per-session token count `n` to every token row;
# distinct() then collapses the data to one row per session, so each session
# is drawn once instead of once for every token it contains.
map <- leaflet(
  data = corpus_geo %>%
    add_count(session_name) %>%
    distinct(session_name, session_location, lon_session, lat_session, n)
) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~lon_session,
    lat = ~lat_session,
    radius = ~log(n),
    popup = ~glue('Recording place: {session_location}</br>
Number of tokens: {n}')
  )
frameWidget(map)
# Clustered map built from `kpv`, using only the rows that have a longitude.
# NOTE(review): `kpv` is not defined in the visible part of the file; the
# popup expects it to carry session_name, title_eng, session_location and
# token_count columns.
# Coordinates are passed through jitter() with factor 10, presumably so that
# markers sharing one location do not sit exactly on top of each other.
kpv_map <- kpv %>%
  filter(!is.na(lon_session)) %>%
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~jitter(lon_session, 10),
    lat = ~jitter(lat_session, 10),
    popup = ~glue('{session_name}</br>
{title_eng}</br>
Recording place: {session_location}</br>
Number of tokens: {token_count}</br>
<a href="">Link to archive</a>'),
    clusterOptions = markerClusterOptions()
  )
frameWidget(kpv_map)