mikefc

Challenge

  • Write R code to add 2 BigInts (arbitrarily large integers)
  • Integer input is as a string containing just the digits 0-9 e.g. "28349823749234234238077632"
  • Integer output should be the same.
  • Contributions welcome!
    • No calling out to C libraries or using Rcpp :)

Prep work: Create random large integers

  • Guarantee that there are no leading zeros (as gmp will interpret as octal)
rbigint <- function(size) {
  paste(c(
    sample(1:9, size = 1),
    sample(0:9, size = size-1, replace = TRUE)
  ), collapse='')
}

rbigint(50)
[1] "75770117627495317919250539542823349934382343361829"

Naive: for loop

  • Split each number into individual digits
  • Pad out each list of digits with zeros so that they’re the same length
  • Loop over the digits, adding them 1 pair at a time, and including any carry from the previous pair.
  • Paste it all back together
add_naive <- function(a, b) {
  # Split the numbers into individiual integer digits
  A <- as.integer(rev(strsplit(a, '')[[1]]))
  B <- as.integer(rev(strsplit(b, '')[[1]]))
  
  # Pad shorter number with zeros
  zeros <- integer(abs(length(A) - length(B)))
  if (length(A) < length(B)) {
    A <- c(A, zeros)
  } else {
    B <- c(B, zeros)
  }
  
  # Allocate a place to store the sum of each pair (plus an extra in case there's a carry)
  digits <- integer(length(A) + 1L)
  
  # Initialise carry so that it's valid for the first pair 
  carry  <- 0L
  
  # loop: add with carry
  for (i in seq_len(length(A))) {
    digit     <- A[i] + B[i] + carry
    carry     <- digit %/% 10L
    digits[i] <- digit %%  10L
  }
  digits[i+1] <- carry
  
  # Turn integer vector back into a string
  result <- paste(rev(digits), collapse = '')
  sub("^0+", '', result) # Remove leading zeros for neatness
}

Example with Naive for loop method.

set.seed(1)
a <- rbigint(20)
b <- rbigint(30)
c(a, b)
[1] "33592896602163747937"           "926123038345418671748675570476"
add_naive(a, b)
[1] "926123038379011568350839318413"
# Compare to `gmp` library 
as.character(gmp::as.bigz(a) + gmp::as.bigz(b))
[1] "926123038379011568350839318413"
Table 1: Summary: add_naive()
name value
a 33592896602163747937926123038345418671748675570476484200356492436247083833488379
b 473372712120687744866329621495973410714694417452255006955595662274171865345
add_naive 33593369974875868625670989367966914645159390264893936455363448031909358005353724
gmp 33593369974875868625670989367966914645159390264893936455363448031909358005353724
identical TRUE

Naive Vectorised

add_vectorised <- function(a, b) {
  # Split the numbers into individiual integer digits
  A <- as.integer(strsplit(a, '')[[1]])
  B <- as.integer(strsplit(b, '')[[1]])
  
  # Pad shorter number with zeros
  zeros <- integer(abs(length(A) - length(B)))
  if (length(A) < length(B)) {
    A <- c(zeros, A)
  } else {
    B <- c(zeros, B)
  }

  # Vectorised addition of each pair of digits
  digits <- A + B
  carry  <- digits %/% 10L
  digits <- digits %%  10L
  digits <- c(0L, digits)
  
  # If there's any carry, then add it in
  while (any(carry > 0L)) {
    carry  <- tail(c(carry, 0L), length(A) + 1)
    digits <- digits + carry
    carry  <- digits %/% 10L
    digits <- digits %%  10L
  }

  # Turn integer vector back into a string
  result <- paste(digits, collapse='')
  sub("^0+", '', result) # Remove leading zeros for neatness
}
Table 2: Summary: add_vectorised()
name value
a 33592896602163747937926123038345418671748675570476484200356492436247083833488379
b 473372712120687744866329621495973410714694417452255006955595662274171865345
add_vectorised 33593369974875868625670989367966914645159390264893936455363448031909358005353724
gmp 33593369974875868625670989367966914645159390264893936455363448031909358005353724
identical TRUE

Vectorised chunks

  • Rather than looking at one digit at a time, look at chunks of 8 digits at a time.
  • Any addition of a pair of chunks of this size will still fit in an R integer
add_vectorised_chunked <- function(a, b) {
  max_chunk_int <- 100000000L

  # Pad shorter number with zeros
  zeros <- paste(rep('0', abs(nchar(a) - nchar(b))), collapse='')
  if (nchar(a) < nchar(b)) {
    a <- paste0(zeros, a)
  } else {
    b <- paste0(zeros, b)
  }
  
  n <- seq(1, nchar(a), 8)
  
  # split into chunks
  A <- rev(as.integer(stringr::str_sub(a, -n-8+1, -n)))
  B <- rev(as.integer(stringr::str_sub(b, -n-8+1, -n)))
  
  # vectorised addition of chunks and calculation of carry
  digits <- A + B
  carry  <- digits %/% max_chunk_int
  digits <- digits %%  max_chunk_int
  digits <- c(0L, digits)
  
  # If there's any carry, then add it in
  while (any(carry > 0L)) {
    carry  <- tail(c(carry, 0L), length(A) + 1)
    digits <- digits + carry
    carry  <- digits %/% max_chunk_int
    digits <- digits %%  max_chunk_int
  }

  # Turn integer vector back into a string
  result <- paste(sprintf("%08i", digits), collapse='')
  sub("^0+", '', result) # Remove leading zeros for neatness
}
Table 3: Summary: add_vectorised_chunked()
name value
a 3359289660216374793792
b 612303834541867174867557047648
add_vectorised_chunked 612303837901156835083931841440
gmp 612303837901156835083931841440
identical TRUE

Benchmarking

Notes

  • Using the gmp library was always expected to be the fastest (written in C)
  • Not that much difference between the naive and vectorised code (without chunking)
  • The extra workin chunking in the vectorised addition only pays off for big integers with more than 100 digits.