# Print the element/attribute outline of the test ELAN file.
xml_structure(read_xml("../adv_elan_draft/notebooks/test.eaf"))
## <ANNOTATION_DOCUMENT [AUTHOR, DATE, FORMAT, VERSION, noNamespaceSchemaLocation, xmlns:xsi]>
## <HEADER [MEDIA_FILE, TIME_UNITS]>
## <MEDIA_DESCRIPTOR [MEDIA_URL, MIME_TYPE, RELATIVE_MEDIA_URL]>
## <PROPERTY [NAME]>
## {text}
## <PROPERTY [NAME]>
## {text}
## <TIME_ORDER>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIME_SLOT [TIME_SLOT_ID, TIME_VALUE]>
## <TIER [LINGUISTIC_TYPE_REF, TIER_ID]>
## <ANNOTATION>
## <ALIGNABLE_ANNOTATION [ANNOTATION_ID, TIME_SLOT_REF1, TIME_SLOT_REF2]>
## <ANNOTATION_VALUE>
## {text}
## <ANNOTATION>
## <ALIGNABLE_ANNOTATION [ANNOTATION_ID, TIME_SLOT_REF1, TIME_SLOT_REF2]>
## <ANNOTATION_VALUE>
## {text}
## <TIER [LINGUISTIC_TYPE_REF, PARENT_REF, PARTICIPANT, TIER_ID]>
## <ANNOTATION>
## <REF_ANNOTATION [ANNOTATION_ID, ANNOTATION_REF]>
## <ANNOTATION_VALUE>
## {text}
## <ANNOTATION>
## <REF_ANNOTATION [ANNOTATION_ID, ANNOTATION_REF]>
## <ANNOTATION_VALUE>
## {text}
## <TIER [LINGUISTIC_TYPE_REF, PARTICIPANT, TIER_ID]>
## <ANNOTATION>
## <ALIGNABLE_ANNOTATION [ANNOTATION_ID, TIME_SLOT_REF1, TIME_SLOT_REF2]>
## <ANNOTATION_VALUE>
## {text}
## <ANNOTATION>
## <ALIGNABLE_ANNOTATION [ANNOTATION_ID, TIME_SLOT_REF1, TIME_SLOT_REF2]>
## <ANNOTATION_VALUE>
## {text}
## <TIER [LINGUISTIC_TYPE_REF, PARENT_REF, PARTICIPANT, TIER_ID]>
## <ANNOTATION>
## <REF_ANNOTATION [ANNOTATION_ID, ANNOTATION_REF]>
## <ANNOTATION_VALUE>
## {text}
## <ANNOTATION>
## <REF_ANNOTATION [ANNOTATION_ID, ANNOTATION_REF]>
## <ANNOTATION_VALUE>
## {text}
## <LINGUISTIC_TYPE [GRAPHIC_REFERENCES, LINGUISTIC_TYPE_ID, TIME_ALIGNABLE]>
## <LINGUISTIC_TYPE [CONSTRAINTS, GRAPHIC_REFERENCES, LINGUISTIC_TYPE_ID, TIME_ALIGNABLE]>
## <LINGUISTIC_TYPE [CONSTRAINTS, GRAPHIC_REFERENCES, LINGUISTIC_TYPE_ID, TIME_ALIGNABLE]>
## <CONSTRAINT [DESCRIPTION, STEREOTYPE]>
## <CONSTRAINT [DESCRIPTION, STEREOTYPE]>
## <CONSTRAINT [DESCRIPTION, STEREOTYPE]>
## <CONSTRAINT [DESCRIPTION, STEREOTYPE]>
library(FRelan)
# Call read_tier() for linguistic type "wordT" on the izva fragment file;
# the result is printed as a tibble.
FRelan::read_tier(
  linguistic_type = "wordT",
  eaf_file = "../testcorpus/kpv_izva20140330-1-fragment.eaf"
)
## # A tibble: 95 x 8
## content annot_id ref_id participant tier_id type time_slot_1
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Ме a124 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 2 , a125 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 3 кӧнечнэ a126 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 4 же a127 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 5 , a128 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 6 вӧлі a129 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 7 кык a130 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 8 лун a131 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 9 в a132 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## 10 шоке a133 a6 MVF-F-1984 word@MVF-F-1984 wordT <NA>
## # ... with 85 more rows, and 1 more variables: time_slot_2 <chr>
# Same file as above, this time asking read_tier() for linguistic type "refT".
FRelan::read_tier(
  linguistic_type = "refT",
  eaf_file = "../testcorpus/kpv_izva20140330-1-fragment.eaf"
)
## # A tibble: 5 x 8
## content annot_id ref_id participant tier_id
## <chr> <chr> <chr> <chr> <chr>
## 1 kpv_izva20140330-1-b-097 a1 <NA> MVF-F-1984 ref@MVF-F-1984
## 2 kpv_izva20140330-1-b-098 a2 <NA> MVF-F-1984 ref@MVF-F-1984
## 3 kpv_izva20140330-1-b-099 a3 <NA> MVF-F-1984 ref@MVF-F-1984
## 4 kpv_izva20140330-1-b-100 a4 <NA> MVF-F-1984 ref@MVF-F-1984
## 5 kpv_izva20140330-1-b-101 a5 <NA> MVF-F-1984 ref@MVF-F-1984
## # ... with 3 more variables: type <chr>, time_slot_1 <chr>,
## # time_slot_2 <chr>
# ELAN file whose tiers are read one by one below.
path_to_file <- "../testcorpus/kpv_udo20120330SazinaJS-encounter.eaf"

# Every tier is read with read_tier() and reduced to the columns needed later.
# select() renames while selecting, so each table gets its final column names
# (and column order) in one step:
#   content  -> the tier's own name (ref, orth, token, lemma, pos)
#   annot_id -> <tier>_id
#   ref_id   -> the id column of the table this one is joined to afterwards

# refT: the only table that keeps the time-slot columns.
ref <- FRelan::read_tier(
  eaf_file = path_to_file,
  linguistic_type = "refT"
) %>%
  dplyr::select(
    ref = content,
    ref_id = annot_id,
    participant,
    time_slot_1,
    time_slot_2
  )

# orthT: ref_id already has the name it needs, so it is kept unchanged.
orth <- FRelan::read_tier(
  eaf_file = path_to_file,
  linguistic_type = "orthT"
) %>%
  dplyr::select(orth = content, orth_id = annot_id, ref_id, participant)

# wordT: ref_id becomes orth_id.
token <- FRelan::read_tier(
  eaf_file = path_to_file,
  linguistic_type = "wordT"
) %>%
  dplyr::select(
    token = content,
    token_id = annot_id,
    orth_id = ref_id,
    participant
  )

# lemmaT: ref_id becomes token_id.
lemma <- FRelan::read_tier(
  eaf_file = path_to_file,
  linguistic_type = "lemmaT"
) %>%
  dplyr::select(
    lemma = content,
    lemma_id = annot_id,
    token_id = ref_id,
    participant
  )

# posT: its own annotation id is not kept; ref_id becomes lemma_id.
pos <- FRelan::read_tier(
  eaf_file = path_to_file,
  linguistic_type = "posT"
) %>%
  dplyr::select(pos = content, lemma_id = ref_id, participant)
# Chain the tier tables together: ref -> orth -> token -> lemma -> pos.
# The join keys are written out explicitly. They are exactly the columns each
# pair of tables shares, so the result is unchanged, but the joins no longer
# depend on dplyr picking the keys itself (and printing "Joining, by = ...").
elan <- ref %>%
  left_join(orth, by = c("ref_id", "participant")) %>%
  left_join(token, by = c("orth_id", "participant")) %>%
  left_join(lemma, by = c("token_id", "participant")) %>%
  left_join(pos, by = c("lemma_id", "participant")) %>%
  # Move the main columns to the front and drop every *_id helper column.
  select(
    token, lemma, pos, time_slot_1, time_slot_2,
    everything(),
    -ends_with("_id")
  )
# Time-slot table of the same file.
time_slots <- FRelan::read_timeslots(path_to_file)

# Swap the two time-slot references for their time values.
# The slot table is joined twice with explicit keys, once per reference column;
# after each join the freshly added time_value column is renamed so the second
# join can add it again.
# NOTE(review): assumes time_slots holds only time_slot_id and time_value, as
# the printed result below suggests - confirm against read_timeslots().
corpus <- elan %>%
  left_join(time_slots, by = c("time_slot_1" = "time_slot_id")) %>%
  rename(time_start = time_value) %>%
  left_join(time_slots, by = c("time_slot_2" = "time_slot_id")) %>%
  rename(time_end = time_value) %>%
  # Main columns first; the time_slot_* reference columns are dropped.
  select(
    token, lemma, pos, participant, time_start, time_end,
    everything(),
    -starts_with("time_slot_")
  )
corpus
## # A tibble: 240 x 8
## token lemma pos participant time_start time_end
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 И и CC NTP-M-1986 170 3730
## 2 эшшӧ эшшӧ _ NTP-M-1986 170 3730
## 3 ӧтик ӧтик Num NTP-M-1986 170 3730
## 4 тор тор N NTP-M-1986 170 3730
## 5 , , CLB NTP-M-1986 170 3730
## 6 мый мый CS NTP-M-1986 170 3730
## 7 тэнад тэ Pron NTP-M-1986 170 3730
## 8 , , CLB NTP-M-1986 170 3730
## 9 тэныд тэ Pron NTP-M-1986 170 3730
## 10 мам мам N NTP-M-1986 170 3730
## # ... with 230 more rows, and 2 more variables: ref <chr>, orth <chr>
#' Read one ELAN file into a single tibble
#'
#' Wraps the step-by-step pipeline shown earlier in this file: the refT,
#' orthT, wordT, lemmaT and posT tiers are read with `FRelan::read_tier()`,
#' joined through their id columns, and the two time-slot references are
#' replaced by the values from `FRelan::read_timeslots()`.
#'
#' NOTE(review): the results printed further down also contain a
#' `session_name` column, which this pipeline does not create. The packaged
#' `FRelan::read_custom_eaf()` presumably does more than the steps above;
#' confirm before using this definition in its place.
#'
#' @param path_to_file Path to an `.eaf` file.
#' @return A tibble with `token`, `lemma`, `pos`, `participant`, `time_start`
#'   and `time_end` first, followed by the remaining columns; the `*_id` and
#'   `time_slot_*` helper columns are dropped.
read_custom_eaf <- function(path_to_file) {
  ref <- FRelan::read_tier(
    eaf_file = path_to_file,
    linguistic_type = "refT"
  ) %>%
    dplyr::select(
      ref = content,
      ref_id = annot_id,
      participant,
      time_slot_1,
      time_slot_2
    )

  orth <- FRelan::read_tier(
    eaf_file = path_to_file,
    linguistic_type = "orthT"
  ) %>%
    dplyr::select(orth = content, orth_id = annot_id, ref_id, participant)

  token <- FRelan::read_tier(
    eaf_file = path_to_file,
    linguistic_type = "wordT"
  ) %>%
    dplyr::select(
      token = content,
      token_id = annot_id,
      orth_id = ref_id,
      participant
    )

  lemma <- FRelan::read_tier(
    eaf_file = path_to_file,
    linguistic_type = "lemmaT"
  ) %>%
    dplyr::select(
      lemma = content,
      lemma_id = annot_id,
      token_id = ref_id,
      participant
    )

  pos <- FRelan::read_tier(
    eaf_file = path_to_file,
    linguistic_type = "posT"
  ) %>%
    dplyr::select(pos = content, lemma_id = ref_id, participant)

  # Join keys are the columns each pair of tables shares.
  elan <- ref %>%
    dplyr::left_join(orth, by = c("ref_id", "participant")) %>%
    dplyr::left_join(token, by = c("orth_id", "participant")) %>%
    dplyr::left_join(lemma, by = c("token_id", "participant")) %>%
    dplyr::left_join(pos, by = c("lemma_id", "participant")) %>%
    dplyr::select(
      token, lemma, pos, time_slot_1, time_slot_2,
      dplyr::everything(),
      -dplyr::ends_with("_id")
    )

  time_slots <- FRelan::read_timeslots(path_to_file)

  # Replace the two time-slot references by their time values.
  elan %>%
    dplyr::left_join(time_slots, by = c("time_slot_1" = "time_slot_id")) %>%
    dplyr::rename(time_start = time_value) %>%
    dplyr::left_join(time_slots, by = c("time_slot_2" = "time_slot_id")) %>%
    dplyr::rename(time_end = time_value) %>%
    dplyr::select(
      token, lemma, pos, participant, time_start, time_end,
      dplyr::everything(),
      -dplyr::starts_with("time_slot_")
    )
}
# Example call of the packaged reader on a single file (illustrative path).
FRelan::read_custom_eaf(
  path_to_file = "path/to/my_file.eaf"
)
# List the ELAN files in the test corpus with their full paths.
# `pattern` is a regular expression: the dot is escaped so only names ending
# in the literal extension ".eaf" match (an unescaped "." matches any
# character, e.g. "notes_eaf").
dir(path = "../testcorpus", pattern = "\\.eaf$", full.names = TRUE)
## [1] "../testcorpus/kpv_izva20140330-1-fragment.eaf"
## [2] "../testcorpus/kpv_izva20140404IgusevJA-fragment.eaf"
## [3] "../testcorpus/kpv_udo20120330SazinaJS-encounter.eaf"
# Or, keeping only the files with "izva" in their name:
# List only the ELAN files whose name contains "izva".
# The extension is matched as a literal ".eaf" (escaped dot); the original
# pattern accepted any name that merely ended in the letters "eaf".
dir(path = "../testcorpus", pattern = ".+izva.+\\.eaf$", full.names = TRUE)
## [1] "../testcorpus/kpv_izva20140330-1-fragment.eaf"
## [2] "../testcorpus/kpv_izva20140404IgusevJA-fragment.eaf"
# Collect the full paths of all ELAN files in the test corpus. The dot in the
# pattern is escaped so that only a literal ".eaf" extension matches.
elan_files <- dir(
  path = "../testcorpus",
  pattern = "\\.eaf$",
  full.names = TRUE
)

# Read every file; map() returns a list with one element per file.
elan_files %>% map(read_custom_eaf)
## [[1]]
## # A tibble: 95 x 9
## token lemma pos participant time_start time_end
## <chr> <lgl> <lgl> <chr> <dbl> <dbl>
## 1 Ме NA NA MVF-F-1984 0 6086
## 2 , NA NA MVF-F-1984 0 6086
## 3 кӧнечнэ NA NA MVF-F-1984 0 6086
## 4 же NA NA MVF-F-1984 0 6086
## 5 , NA NA MVF-F-1984 0 6086
## 6 вӧлі NA NA MVF-F-1984 0 6086
## 7 кык NA NA MVF-F-1984 0 6086
## 8 лун NA NA MVF-F-1984 0 6086
## 9 в NA NA MVF-F-1984 0 6086
## 10 шоке NA NA MVF-F-1984 0 6086
## # ... with 85 more rows, and 3 more variables: ref <chr>, orth <chr>,
## # session_name <chr>
##
## [[2]]
## # A tibble: 279 x 9
## token lemma pos participant time_start time_end
## <chr> <lgl> <lgl> <chr> <dbl> <dbl>
## 1 Значит NA NA JAI-M-1939 0 6196
## 2 , NA NA JAI-M-1939 0 6196
## 3 турун NA NA JAI-M-1939 0 6196
## 4 ми NA NA JAI-M-1939 0 6196
## 5 пуктам NA NA JAI-M-1939 0 6196
## 6 вӧлі NA NA JAI-M-1939 0 6196
## 7 Кытшыль NA NA JAI-M-1939 0 6196
## 8 коськын NA NA JAI-M-1939 0 6196
## 9 , NA NA JAI-M-1939 0 6196
## 10 квайт NA NA JAI-M-1939 0 6196
## # ... with 269 more rows, and 3 more variables: ref <chr>, orth <chr>,
## # session_name <chr>
##
## [[3]]
## # A tibble: 240 x 9
## token lemma pos participant time_start time_end
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 И и CC NTP-M-1986 170 3730
## 2 эшшӧ эшшӧ _ NTP-M-1986 170 3730
## 3 ӧтик ӧтик Num NTP-M-1986 170 3730
## 4 тор тор N NTP-M-1986 170 3730
## 5 , , CLB NTP-M-1986 170 3730
## 6 мый мый CS NTP-M-1986 170 3730
## 7 тэнад тэ Pron NTP-M-1986 170 3730
## 8 , , CLB NTP-M-1986 170 3730
## 9 тэныд тэ Pron NTP-M-1986 170 3730
## 10 мам мам N NTP-M-1986 170 3730
## # ... with 230 more rows, and 3 more variables: ref <chr>, orth <chr>,
## # session_name <chr>
# Read every ELAN file and stack the per-file tibbles into one table.
elan_corpus <- bind_rows(map(elan_files, read_custom_eaf))
# Read every CMDI file in the test corpus with read_cmdi() and stack the
# results into one table. The pattern is anchored to the literal ".cmdi"
# extension; the original "cmdi$" matched any name ending in those letters.
meta <- dir("../testcorpus/", pattern = "\\.cmdi$", full.names = TRUE) %>%
  map(read_cmdi) %>%
  bind_rows()
# Join `meta` and the contents of coordinates.csv onto the corpus. No keys are
# given, so dplyr joins on whatever column names the tables share.
test_corpus <- elan_corpus %>%
  left_join(meta) %>%
  left_join(read_csv("coordinates.csv"))

# Save the combined table to disk, then load it back from that file.
write_rds(test_corpus, "test_corpus.rds")
test_corpus <- read_rds("test_corpus.rds")
# Run parse_corpus.R (presumably the script that defines the function called
# next - confirm), then build the corpus from a folder in a single call.
source("parse_corpus.R")
test_corpus <- monster_function_that_does_everything(
  folder_to_go = "~/Desktop/corpus"
)

# Open the table in the data viewer.
test_corpus %>% View()

# Number of rows per participant.
count(test_corpus, participant)
## # A tibble: 4 x 2
## participant n
## <chr> <int>
## 1 JAI-M-1939 275
## 2 JSS-F-1988 197
## 3 MVF-F-1984 95
## 4 NTP-M-1986 47
# Number of rows per session location.
count(test_corpus, session_location)
## # A tibble: 3 x 2
## session_location n
## <chr> <int>
## 1 Diyur, Russia 95
## 2 Helsinki, Finland 240
## 3 Syktyvkar, Russia 279
# Number of rows per birth year.
count(test_corpus, year_birth)
## # A tibble: 4 x 2
## year_birth n
## <chr> <int>
## 1 1939 275
## 2 1984 95
## 3 1986 47
## 4 1988 197