Parsing chess game files in PGN format

Chess game format: pgn

The pgn file format is a human readable representation of a chess game.

In its most basic form, it consists of

a sequence of tags (i.e. metadata about the game, such as the event, the date and the players) surrounded by []
a sequence of numbers and events representing the moves taken by the players i.e.
- A number indicating which move this is within the game.
- The moves for the white and black players, represented in Standard Algebraic Notation (SAN).
Comments can be interspersed between/within the moves and are surrounded by “{}”

An example pgn file is shown below:

# Example game held as a single multi-line string: seven [tag] lines, a blank
# line, then the numbered movetext with one inline {comment} and the final
# result token (1/2-1/2).
pgn_text <- '
[Event "F/S Return Match"]
[Site "Belgrade, Serbia JUG"]
[Date "1992.11.04"]
[Round "29"]
[White "Fischer, Robert J."]
[Black "Spassky, Boris V."]
[Result "1/2-1/2"]

1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 {This opening is called the Ruy Lopez.}
4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3 O-O 9. h3 Nb8 10. d4 Nbd7
11. c4 c6 12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15. Nb1 h6 16. Bh4 c5 17. dxe5
Nxe4 18. Bxe7 Qxe7 19. exd6 Qf6 20. Nbd2 Nxd6 21. Nc4 Nxc4 22. Bxc4 Nb6
23. Ne5 Rae8 24. Bxf7+ Rxf7 25. Nxf7 Rxe1+ 26. Qxe1 Kxf7 27. Qe3 Qg5 28. Qxg5
hxg5 29. b3 Ke6 30. a3 Kd6 31. axb4 cxb4 32. Ra5 Nd5 33. f3 Bc8 34. Kf2 Bf5
35. Ra7 g6 36. Ra6+ Kc5 37. Ke1 Nf4 38. g3 Nxh3 39. Kd2 Kb5 40. Rd6 Kc5 41. Ra6
Nf2 42. g4 Bd3 43. Re6 1/2-1/2
'

Use `lex()` to turn the text into tokens

Start by defining the regular expression patterns for each element in the pgn file.
Use flexo::lex() to turn the pgn text into tokens.
Throw away whitespace and newlines straight away. Tags are kept as tokens for now and skipped over later, since I’m not interested in them.

# Named regex patterns, one per token type; the names become the token names.
# Keep `move_number` ahead of `symbol`: text such as "1." matches both
# patterns and has to be tagged as a move number for the parsing code below
# (presumably earlier patterns take precedence in the lexer - confirm against
# the flexo documentation before reordering).
pgn_regexes <- c(
  comment       = '(;.*?)\n',       # Assume ; only appears to denote comment to end of line
  tag           = '\\[.*?\\]',    # parse tags as a whole token. going to ignore
  comment_open  = "\\{",          # Inline comment start
  comment_close = "\\}",          # Inline comment end
  move_number   = "\\d+\\.+",     # "1." and also the "1..." continuation form
  # SAN moves, castling (O-O) and results (1/2-1/2). Besides '+' for check,
  # SAN also uses '#' for checkmate, '=' for promotion and '!'/'?' as
  # annotation suffixes, so those are part of a symbol too.
  symbol        = '[-+#=!?\\w\\./]+',
  newline       = '\n',
  whitespace    = '\\s+'
)

# Lex the PGN text and keep every token except pure layout (whitespace and
# newlines); tag tokens are still present at this point.
tokens <- local({
  lexed <- flexo::lex(pgn_text, pgn_regexes)
  is_layout <- names(lexed) %in% c('whitespace', 'newline')
  lexed[!is_layout]
})

# Peek at the first 23 tokens
tokens[seq_len(23)]

##                               tag                               tag 
##    "[Event \"F/S Return Match\"]" "[Site \"Belgrade, Serbia JUG\"]" 
##                               tag                               tag 
##           "[Date \"1992.11.04\"]"                  "[Round \"29\"]" 
##                               tag                               tag 
##  "[White \"Fischer, Robert J.\"]"   "[Black \"Spassky, Boris V.\"]" 
##                               tag                       move_number 
##            "[Result \"1/2-1/2\"]"                              "1." 
##                            symbol                            symbol 
##                              "e4"                              "e5" 
##                       move_number                            symbol 
##                              "2."                             "Nf3" 
##                            symbol                       move_number 
##                             "Nc6"                              "3." 
##                            symbol                            symbol 
##                             "Bb5"                              "a6" 
##                      comment_open                            symbol 
##                               "{"                            "This" 
##                            symbol                            symbol 
##                         "opening"                              "is" 
##                            symbol                            symbol 
##                          "called"                             "the" 
##                            symbol 
##                             "Ruy"

Use `TokenStream` to help turn the tokens into coherent data.frame

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Initialise a TokenStream object so I can manipulate the stream of tokens.
# Namespace-qualified, like the flexo::lex() call, so this snippet does not
# rely on library(flexo) having been attached.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stream <- flexo::TokenStream$new(tokens)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Fast-forward the stream to the first 'move_number'.
# The tokens in front of it (the tags in this example) are kept in `tags`.
# inclusive = FALSE so that the move number itself stays in the stream, which
# is where the move-extraction code expects to start.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tags <- stream$consume_until(name = 'move_number', inclusive = FALSE)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Extract a full move (i.e. player1 and player2 moves) from a stream.
#
# The function assumes that the stream is positioned at the start of a move.
# If the stream is instead positioned on the opening of a comment, that
# comment (and any comments directly following it) is consumed and discarded
# first.
#
# @param stream token stream object providing consume(), consume_until() and
#        end_of_stream(). It is advanced past the tokens that are read.
# @return a one-row data.frame which always has the columns `move`, `player1`
#         and `player2`. A player column is NA when the stream ran out before
#         that move was found, so rows can always be rbind()-ed together.
#
# Notes:
#  - The first non-comment token is used as the move label as-is; it is not
#    checked to be a 'move_number' token.
#  - Any 'symbol' token counts as a move, so a trailing result such as
#    "1/2-1/2" is recorded as if it were a move.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
extract_full_move <- function(stream) {
  move_num <- stream$consume(1)

  # Skip every leading comment, not just the first one. identical() is used
  # instead of `==` so that an unnamed/empty token cannot produce a
  # zero-length condition.
  while (identical(names(move_num), 'comment_open')) {
    stream$consume_until(name = 'comment_close')
    move_num <- stream$consume(1)
  }

  moves <- character(0)

  # Collect up to two move symbols; inline comments are skipped and any other
  # token type (e.g. a "3..." continuation number) is ignored.
  while (length(moves) < 2 && !stream$end_of_stream()) {
    tok <- stream$consume(1)
    tok_name <- names(tok)
    if (identical(tok_name, 'symbol')) {
      moves <- c(moves, unname(tok))
    } else if (identical(tok_name, 'comment_open')) {
      stream$consume_until(name = 'comment_close')
    }
  }

  # Pad with NA so both player columns always exist, even when no move was
  # found (a NULL column would be dropped by data.frame())
  length(moves) <- 2

  data.frame(
    move    = unname(move_num),
    player1 = moves[1],
    player2 = moves[2]
  )
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pull one full move at a time off the stream until it is exhausted
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
moves_list <- list()
repeat {
  if (stream$end_of_stream()) {
    break
  }
  moves_list <- c(moves_list, list(extract_full_move(stream)))
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Stack the one-row data.frames into the final data.frame of moves
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
do.call(rbind, moves_list)

##    move player1 player2
## 1    1.      e4      e5
## 2    2.     Nf3     Nc6
## 3    3.     Bb5      a6
## 4    4.     Ba4     Nf6
## 5    5.     O-O     Be7
## 6    6.     Re1      b5
## 7    7.     Bb3      d6
## 8    8.      c3     O-O
## 9    9.      h3     Nb8
## 10  10.      d4    Nbd7
## 11  11.      c4      c6
## 12  12.    cxb5    axb5
## 13  13.     Nc3     Bb7
## 14  14.     Bg5      b4
## 15  15.     Nb1      h6
## 16  16.     Bh4      c5
## 17  17.    dxe5    Nxe4
## 18  18.    Bxe7    Qxe7
## 19  19.    exd6     Qf6
## 20  20.    Nbd2    Nxd6
## 21  21.     Nc4    Nxc4
## 22  22.    Bxc4     Nb6
## 23  23.     Ne5    Rae8
## 24  24.   Bxf7+    Rxf7
## 25  25.    Nxf7   Rxe1+
## 26  26.    Qxe1    Kxf7
## 27  27.     Qe3     Qg5
## 28  28.    Qxg5    hxg5
## 29  29.      b3     Ke6
## 30  30.      a3     Kd6
## 31  31.    axb4    cxb4
## 32  32.     Ra5     Nd5
## 33  33.      f3     Bc8
## 34  34.     Kf2     Bf5
## 35  35.     Ra7      g6
## 36  36.    Ra6+     Kc5
## 37  37.     Ke1     Nf4
## 38  38.      g3    Nxh3
## 39  39.     Kd2     Kb5
## 40  40.     Rd6     Kc5
## 41  41.     Ra6     Nf2
## 42  42.      g4     Bd3
## 43  43.     Re6 1/2-1/2

mikefc

2021-12-07

Chess game format: pgn

Use `lex()` to turn the text into tokens

Use `TokenStream` to help turn the tokens into coherent data.frame

Parsing chess game files in PGN format

mikefc

2021-12-07

Chess game format: pgn

Use lex() to turn the text into tokens

Use TokenStream to help turn the tokens into coherent data.frame

Use `lex()` to turn the text into tokens

Use `TokenStream` to help turn the tokens into coherent data.frame