library(flexo)
flexo: Simple Lex/Parse Tools in R
{flexo} is a small package containing a lexer (for splitting text into tokens).
The v0.2.6 update of flexo added support for stringi regex options such as multiline matching, and this vignette is a quick test of that new functionality.
Parsing subtitles in srt format with flexo
The srt subtitle format is a simple representation of subtitles for video which consists of timestamped lines of text.
SRT format example
The first 10 lines of dialogue from “It’s a Wonderful Life” in srt format:
# Sample subtitle data held as one multi-line string literal.
# It contains ten entries; each entry is an index line, a
# 'start --> end' timestamp line and one line of dialogue.
# The literal begins and ends with a newline.
srt <- "
1
00:01:25,210 --> 00:01:28,004
I owe everything to George Bailey.
2
00:01:28,422 --> 00:01:30,298
Help him, dear Father.
3
00:01:30,674 --> 00:01:33,718
Joseph, Jesus and Mary,
4
00:01:33,802 --> 00:01:36,429
help my friend Mr. Bailey.
5
00:01:36,889 --> 00:01:39,515
Help my son George tonight.
6
00:01:40,350 --> 00:01:42,226
He never thinks about himself, God.
7
00:01:42,311 --> 00:01:44,061
That's why he's in trouble.
8
00:01:44,146 --> 00:01:45,313
George is a good guy.
9
00:01:46,482 --> 00:01:47,732
Give him a break, God.
10
00:01:47,816 --> 00:01:49,942
I love him, dear Lord.
"
Lex the srt file into tokens
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Re-encode the input as UTF-8 and join it into one newline-separated string
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
srt <- paste(enc2utf8(srt), collapse = "\n")

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# One named regex per token type; the name becomes the token's label.
#   time  - a single HH:MM:SS,mmm timestamp
#   link  - the '-->' separator between two timestamps (plus whitespace)
#   index - a line made up only of digits
#   text  - any other non-empty line
# NOTE(review): pattern order presumably matters to lex(); confirm in the
# flexo documentation before reordering.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
srt_regexes <- c(
  time  = "\\d+:\\d+:\\d+,\\d+",
  link  = "\\s*-->\\s*",
  index = "^\\d+$",
  text  = "^.+?$"
)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Tokenise with the multiline regex option switched on, then discard the
# 'link' separators so only index, time and text tokens remain
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tokens <- lex(srt, srt_regexes, multiline = TRUE)
tokens <- tokens[names(tokens) != "link"]
tokens
#> index time
#> "1" "00:01:25,210"
#> time text
#> "00:01:28,004" "I owe everything to George Bailey."
#> index time
#> "2" "00:01:28,422"
#> time text
#> "00:01:30,298" "Help him, dear Father."
#> index time
#> "3" "00:01:30,674"
#> time text
#> "00:01:33,718" "Joseph, Jesus and Mary,"
#> index time
#> "4" "00:01:33,802"
#> time text
#> "00:01:36,429" "help my friend Mr. Bailey."
#> index time
#> "5" "00:01:36,889"
#> time text
#> "00:01:39,515" "Help my son George tonight."
#> index time
#> "6" "00:01:40,350"
#> time text
#> "00:01:42,226" "He never thinks about himself, God."
#> index time
#> "7" "00:01:42,311"
#> time text
#> "00:01:44,061" "That's why he's in trouble."
#> index time
#> "8" "00:01:44,146"
#> time text
#> "00:01:45,313" "George is a good guy."
#> index time
#> "9" "00:01:46,482"
#> time text
#> "00:01:47,732" "Give him a break, God."
#> index time
#> "10" "00:01:47,816"
#> time text
#> "00:01:49,942" "I love him, dear Lord."
Parse raw tokens into a data.frame
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Merge together runs of text
#
# Consecutive 'text' tokens are joined with a newline into one string.
# rle() over the token names gives the length of every run; cumsum() of
# those lengths gives the position of each run's last token.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
rl    <- unclass(rle(names(tokens)))
end   <- cumsum(rl$lengths)[rl$values == "text"]
len   <- rl$lengths[rl$values == "text"]
start <- end - len + 1L

# vapply() pins the result to a character vector even when there are no
# text runs, where mapply() would simplify to an empty list.
text <- vapply(
  seq_along(start),
  function(i) paste(tokens[start[i]:end[i]], collapse = "\n"),
  character(1)
)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Extract index and time vectors
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
indices <- as.integer(tokens[names(tokens) == "index"])
times   <- tokens[names(tokens) == "time"]

# data.frame() silently recycles columns whose lengths divide evenly, so
# check the expected shape (one index, two times and one text run per
# subtitle) instead of risking misaligned rows.
if (length(times) != 2L * length(indices) ||
    length(text) != length(indices)) {
  stop(
    "Malformed srt input: expected one index, two times and one text run ",
    "per subtitle",
    call. = FALSE
  )
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Munge into data.frame
#
# 'times' alternates start, end, start, end, ... so a recycled logical mask
# picks out every other element. TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned. unname() drops the "time"
# token labels so they cannot end up as row names.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data.frame(
  index = indices,
  start = unname(times[c(TRUE, FALSE)]),
  end   = unname(times[c(FALSE, TRUE)]),
  text  = text,
  stringsAsFactors = FALSE
)
#> index start end text
#> 1 1 00:01:25,210 00:01:28,004 I owe everything to George Bailey.
#> 2 2 00:01:28,422 00:01:30,298 Help him, dear Father.
#> 3 3 00:01:30,674 00:01:33,718 Joseph, Jesus and Mary,
#> 4 4 00:01:33,802 00:01:36,429 help my friend Mr. Bailey.
#> 5 5 00:01:36,889 00:01:39,515 Help my son George tonight.
#> 6 6 00:01:40,350 00:01:42,226 He never thinks about himself, God.
#> 7 7 00:01:42,311 00:01:44,061 That's why he's in trouble.
#> 8 8 00:01:44,146 00:01:45,313 George is a good guy.
#> 9 9 00:01:46,482 00:01:47,732 Give him a break, God.
#> 10 10 00:01:47,816 00:01:49,942 I love him, dear Lord.