New {flexo} vignette - parsing subtitles in srt format

library(flexo)

flexo: Simple Lex/Parse Tools in R

{flexo} is a small package containing a lexer (for splitting text into tokens).

The v0.2.6 update of flexo added support for stringi regex options such as multiline matching, and this vignette is a quick test of that new functionality.

Parsing subtitles in srt format with flexo

The srt subtitle format is a simple representation of subtitles for video which consists of timestamped lines of text.

SRT format example

The first 10 lines of dialogue from “It’s a Wonderful Life” in srt format

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sample subtitle data held as one multi-line string.
# Each entry is an index line, a 'start --> end' timestamp line and a line
# of text; entries are separated by blank lines.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
srt <- "
1
00:01:25,210 --> 00:01:28,004
I owe everything to George Bailey.

2
00:01:28,422 --> 00:01:30,298
Help him, dear Father.

3
00:01:30,674 --> 00:01:33,718
Joseph, Jesus and Mary,

4
00:01:33,802 --> 00:01:36,429
help my friend Mr. Bailey.

5
00:01:36,889 --> 00:01:39,515
Help my son George tonight.

6
00:01:40,350 --> 00:01:42,226
He never thinks about himself, God.

7
00:01:42,311 --> 00:01:44,061
That's why he's in trouble.

8
00:01:44,146 --> 00:01:45,313
George is a good guy.

9
00:01:46,482 --> 00:01:47,732
Give him a break, God.

10
00:01:47,816 --> 00:01:49,942
I love him, dear Lord.
"

Lex the srt file into tokens

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Token patterns: one named regex per kind of token.
# 'text' is listed last because its pattern also matches timestamp and
# index lines (presumably earlier patterns take precedence in the lexer -
# confirm against the flexo documentation).
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
srt_regexes <- c(
  # A timestamp such as 00:01:25,210
  time = "\\d+:\\d+:\\d+,\\d+",
  # The arrow between two timestamps, including surrounding whitespace
  link = "\\s*-->\\s*",
  # Digits only, anchored at both ends (the subtitle counter)
  index = "^\\d+$",
  # Any other non-empty run of characters, anchored at both ends
  text = "^.+?$"
)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Convert to UTF-8 and join all elements into one newline-separated string
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
srt <- paste0(enc2utf8(srt), collapse = "\n")

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Lex the string into named tokens with the multiline regex option enabled,
# then discard the 'link' tokens that sit between the two times
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tokens <- lex(srt, srt_regexes, multiline = TRUE)
tokens <- tokens[names(tokens) != "link"]

tokens
#>                                 index                                  time 
#>                                   "1"                        "00:01:25,210" 
#>                                  time                                  text 
#>                        "00:01:28,004"  "I owe everything to George Bailey." 
#>                                 index                                  time 
#>                                   "2"                        "00:01:28,422" 
#>                                  time                                  text 
#>                        "00:01:30,298"              "Help him, dear Father." 
#>                                 index                                  time 
#>                                   "3"                        "00:01:30,674" 
#>                                  time                                  text 
#>                        "00:01:33,718"             "Joseph, Jesus and Mary," 
#>                                 index                                  time 
#>                                   "4"                        "00:01:33,802" 
#>                                  time                                  text 
#>                        "00:01:36,429"          "help my friend Mr. Bailey." 
#>                                 index                                  time 
#>                                   "5"                        "00:01:36,889" 
#>                                  time                                  text 
#>                        "00:01:39,515"         "Help my son George tonight." 
#>                                 index                                  time 
#>                                   "6"                        "00:01:40,350" 
#>                                  time                                  text 
#>                        "00:01:42,226" "He never thinks about himself, God." 
#>                                 index                                  time 
#>                                   "7"                        "00:01:42,311" 
#>                                  time                                  text 
#>                        "00:01:44,061"         "That's why he's in trouble." 
#>                                 index                                  time 
#>                                   "8"                        "00:01:44,146" 
#>                                  time                                  text 
#>                        "00:01:45,313"               "George is a good guy." 
#>                                 index                                  time 
#>                                   "9"                        "00:01:46,482" 
#>                                  time                                  text 
#>                        "00:01:47,732"              "Give him a break, God." 
#>                                 index                                  time 
#>                                  "10"                        "00:01:47,816" 
#>                                  time                                  text 
#>                        "00:01:49,942"              "I love him, dear Lord."

Parse raw tokens into a data.frame

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Merge together runs of text
#
# Consecutive 'text' tokens are joined with newlines into a single string.
# Run-length encoding the token names gives the length of every run;
# the cumulative sum of those lengths is the position of each run's last
# token, from which the position of its first token follows.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
rl      <- unclass(rle(names(tokens)))
is_text <- rl$values == 'text'
end     <- cumsum(rl$lengths)[is_text]
len     <- rl$lengths[is_text]
start   <- end - len + 1

# vapply() always yields a character vector (character(0) when there are no
# text runs), whereas mapply()'s simplified result would be an empty list.
text <- vapply(seq_along(start), function(i) {
  paste(tokens[start[i]:end[i]], collapse = "\n")
}, character(1))


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pull out the subtitle counters (as integers) and the timestamps,
# selecting tokens by their name
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
token_type <- names(tokens)

times   <- tokens[token_type == "time"]
indices <- as.integer(tokens[token_type == "index"])

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Munge into data.frame
#
# The recycled logical masks select alternating elements of 'times':
# odd positions become 'start' and even positions become 'end'.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data.frame(
  index  = indices,
  start  = times[c(TRUE, FALSE)],
  end    = times[c(FALSE, TRUE)],
  text   = text,
  stringsAsFactors = FALSE
)
#>    index        start          end                                text
#> 1      1 00:01:25,210 00:01:28,004  I owe everything to George Bailey.
#> 2      2 00:01:28,422 00:01:30,298              Help him, dear Father.
#> 3      3 00:01:30,674 00:01:33,718             Joseph, Jesus and Mary,
#> 4      4 00:01:33,802 00:01:36,429          help my friend Mr. Bailey.
#> 5      5 00:01:36,889 00:01:39,515         Help my son George tonight.
#> 6      6 00:01:40,350 00:01:42,226 He never thinks about himself, God.
#> 7      7 00:01:42,311 00:01:44,061         That's why he's in trouble.
#> 8      8 00:01:44,146 00:01:45,313               George is a good guy.
#> 9      9 00:01:46,482 00:01:47,732              Give him a break, God.
#> 10    10 00:01:47,816 00:01:49,942              I love him, dear Lord.