Introduction

We have been given a quest by the VGAM Lord Thomas Yee (t-yeezy) to improve some of the matrix multiplication in VGAM.

The particular problem is transforming a matrix in matrix-band format into a set of square matrices, doing a Cholesky decomposition of each, and multiplying each factor with the corresponding block of a design matrix.

Existing solution

We will use an ordinal multinomial example based on the pneumo data frame, which contains 8 rows (observations) and 4 columns (plus the derived let column). The model fits the ordinal response category (normal < mild < severe) as a function of let = log(exposure.time).

From what I think I understand, we’re fitting an adjacent-categories model (acat’s default is a log link on the ratios of adjacent category probabilities), not a cumulative-link one.

library( devtools )
## Loading required package: usethis
library( VGAM )
## Loading required package: stats4
## Loading required package: splines
# Run the acat example; this creates `fit` (and pneumo's let column)
example_output = capture.output( example(acat) )

# Dataset
pneumo
##   exposure.time normal mild severe      let
## 1           5.8     98    0      0 1.757858
## 2          15.0     51    2      1 2.708050
## 3          21.5     34    6      3 3.068053
## 4          27.5     35    5      8 3.314186
## 5          33.5     32   10      9 3.511545
## 6          39.5     23    7      8 3.676301
## 7          46.0     12    6     10 3.828641
## 8          51.5      4    2      5 3.941582
# Model
fit
## 
## Call:
## vglm(formula = cbind(normal, mild, severe) ~ let, family = acat, 
##     data = pneumo)
## 
## 
## Coefficients:
## (Intercept):1 (Intercept):2         let:1         let:2 
##    -8.9360297    -3.0390622     2.1653729     0.9020936 
## 
## Degrees of Freedom: 16 Total; 12 Residual
## Residual deviance: 5.347382 
## Log-likelihood: -25.25054 
## 
## This is an adjacent categories model with 3 levels

We now extract the model dimensions and the working weights.

M <- npred(fit)
nn <- nobs(fit)
wzedd <- weights(fit, type = "working")
# UU holds the Cholesky factors in matrix-band format
UU <- vchol(wzedd, M = M, n = nn, silent = TRUE)
X.vlm <- model.matrix(fit, type = "vlm")
UU.X.vlm <- mux111(cc = UU, xmat = X.vlm, M = M)
UU.X.vlm
##     (Intercept):1 (Intercept):2     let:1     let:2
## 1:1     0.8398341     0.1591424  1.476309 0.2797498
## 1:2     0.0000000     0.3303302  0.000000 0.5806735
## 2:1     1.8376577     0.6527830  4.976469 1.7677693
## 2:2     0.0000000     0.9105143  0.000000 2.4657183
## 3:1     2.3485868     1.0159178  7.205589 3.1168895
## 3:2     0.0000000     1.2628877  0.000000 3.8746064
## 4:1     3.0058420     1.4658393  9.961919 4.8580639
## 4:2     0.0000000     1.7365937  0.000000 5.7553944
## 5:1     3.4291766     1.8247533 12.041709 6.4077042
## 5:2     0.0000000     2.1398457  0.000000 7.5141655
## 6:1     3.0752930     1.7494893 11.305702 6.4316488
## 6:2     0.0000000     2.0851612  0.000000 7.6656795
## 7:1     2.6207089     1.5782477 10.033754 6.0425443
## 7:2     0.0000000     1.9529582  0.000000 7.4771765
## 8:1     1.5901746     0.9960147  6.267803 3.9258735
## 8:2     0.0000000     1.2854459  0.000000 5.0666903

The underlying computations in vchol appear to be:

# Convert each row of the weights matrix to a square matrix
    # (each row is in matrix-band format)
wzar <- m2a(wzedd, M = M)
wzar[,,1]
##           [,1]      [,2]
## [1,] 0.7053214 0.1336533
## [2,] 0.1336533 0.1344443
# Do the Cholesky decomposition
chol(wzar[,,1])  # Same as UU[, 1]
##           [,1]      [,2]
## [1,] 0.8398341 0.1591424
## [2,] 0.0000000 0.3303302
# Notes:
    # vchol also includes a correction step: if the decomposition
    # fails for a matrix, it adjusts that matrix and retries
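
The correction itself is not shown here, but the general idea might look like the following RcppArmadillo sketch (hypothetical code, not VGAM’s actual vchol logic): nudge the diagonal and retry whenever the decomposition fails.

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// Crude "correct and retry" Cholesky: if chol() fails (the matrix is not
// positive definite), add a small ridge to the diagonal and try again.
// This only illustrates the idea; vchol's real correction differs.
// [[Rcpp::export]]
arma::mat chol_with_fixup(arma::mat S, const int max_tries = 5) {
  arma::mat R;
  for (int t = 0; t < max_tries; ++t) {
    if (arma::chol(R, S)) return R;   // success: R is upper-triangular
    arma::vec d = S.diag();
    S.diag() += 1e-8 * (1.0 + arma::abs(d).max());  // ridge the diagonal
  }
  Rcpp::stop("matrix could not be made positive definite");
}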

Inside mux111, the compiled code appears to reconstruct the square matrices from UU and multiply each by the corresponding rows of X.vlm. Some notes from Thomas:

mux111 <- function(cc, xmat, M, upper = TRUE) {
# 19990813
#
# 20170605: some modifications to get hdeff() going.

# Input:
# 1. cc is dimm x n, where dimm is
#    usually M or M(M+1)/2, e.g., is output from vchol().
#    That is, cc is some matrix in matrix-band format.
# 2. xmat is n*M x R. Each sub-block is M x R and is
#    multiplied by each column of cc.
#
# Output:
# 1. (n*M) x R matrix, such that, in old notation,
#    ans[1:M, ] <- cc[, , 1] %*% xmat[1:M, ], etc.
#
# Notes:
# 1. It is specifically geared to vglm.fit(), and is a special case
#    of mux11().
# 2. If M = 1 then cc cannot simply be a vector.
#    20170605; not sure about the above line.
# 3. NAs in xmat and/or cc are okay.
# 4. cc represents an upper-triangular matrix by default, but if not
#    then it is symmetric (set upper = FALSE).

   if (!is.matrix(xmat))
     xmat <- as.matrix(xmat)
   if (!is.matrix(cc))
     cc <- t(as.matrix(cc))
   R <- ncol(xmat)
   n <- nrow(xmat) / M
   index <- iam(NA, NA, M, both = TRUE, diag = TRUE)
   dimm.value <- nrow(cc)  # Usually M or M(M+1)/2

   fred <- .C("mux111ccc",
              as.double(cc), b = as.double(t(xmat)),
              as.integer(M), as.integer(R), as.integer(n),
              wkcc = double(M * M), wk2 = double(M * R),
              as.integer(index$row), as.integer(index$col),
              as.integer(dimm.value),
              as.integer(upper), NAOK = TRUE)

   ans <- fred$b
   dim(ans) <- c(R, n * M)
   d <- dimnames(xmat)
   dimnames(ans) <- list(d[[2]], d[[1]])
   t(ans)
}
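
The actual work happens in the compiled routine "mux111ccc", which is not reproduced here. As a rough RcppArmadillo sketch of the computation (hypothetical name and signature), each column of cc is unpacked into a triangular matrix and multiplied with the matching M-row block of xmat:

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <algorithm>

// cc: dimm x n matrix-band factors (e.g. from vchol); xmat: (n*M) x R;
// row/col: 0-based iam() indices mapping band entries to matrix positions.
// [[Rcpp::export]]
arma::mat mux111_sketch(const arma::mat& cc, const arma::mat& xmat,
                        const int M, const arma::uvec& row,
                        const arma::uvec& col) {
  const arma::uword n = xmat.n_rows / M;
  arma::mat ans(xmat.n_rows, xmat.n_cols);
  arma::mat U(M, M);
  for (arma::uword i = 0; i < n; ++i) {
    U.zeros();
    for (arma::uword k = 0; k < cc.n_rows; ++k)  // rebuild the factor,
      U(std::min(row(k), col(k)),                // forcing each entry into
        std::max(row(k), col(k))) = cc(k, i);    // the upper triangle
    ans.rows(i * M, (i + 1) * M - 1) =
      U * xmat.rows(i * M, (i + 1) * M - 1);     // block-wise product
  }
  return ans;
}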

Make a list of the VGAM results to test against.

VGAM_solutions = list(
    wzedd = wzedd,
    wzar = wzar,
    UU = UU,
    UU.X.vlm = UU.X.vlm
)

Proposed solutions

  • Instead of doing the double conversion matrix-band (wzedd) -> square matrices (inside vchol) -> matrix-band (UU), we can skip the last step by writing a version of mux111 that accepts an array of square matrices.
  • Use Armadillo to abstract the single/double loops into vector operations, to improve readability/maintainability.
library( Rcpp )
sourceCpp( "cpp_implementations.cpp" )

wz_weights = wweights( fit, matrix.arg = TRUE, deriv.arg = FALSE )
N = nobs(fit)
pred_dim = npred(fit)
index <- iam(NA, NA, pred_dim, both = TRUE, diag = TRUE)

# We begin with a solution that does a double loop.
    # First loop iterates over the N observations and
        # assigns the diagonals from the matrix-band columns to the return array
    # Second loop assigns the off-diagonal elements
wzar_cpp = m2a_cpp( wz_weights, pred_dim, 0, 
    index$row.index-1, index$col.index-1 )

all( wzar_cpp == VGAM_solutions$wzar )
## [1] TRUE
# Alternative solution for one row that uses matrix indexing
N = nrow( wz_weights )
# Recovers M by inverting dimm = M * (M + 1) / 2
n_triangular_elements = 1/2 * ( sqrt( 8 * ncol(wz_weights) + 1 ) - 1 )
index_matrix = cbind( index$row.index, index$col.index )

wzar_cpp_single = sapply( 1:N, function( N_ ){
    result = m2a_cpp_1( wz_weights, t(index_matrix-1), 0,
        n_triangular_elements, N_ - 1 )
    result == VGAM_solutions$wzar[,,N_]
} )

all( wzar_cpp_single )
## [1] TRUE
# Extend this result by looping over N in c++
wzar_cpp_all = m2a_cpp_x( wz_weights, t(index_matrix-1), 0,
    n_triangular_elements, N )
all( wzar_cpp_all == VGAM_solutions$wzar )
## [1] TRUE
# An alternative to the above: instead of indexing the return
    # array through slice views, we try direct cube indexing
wzar_cpp_all2 = m2a_cpp_x2( wz_weights, rbind(t(index_matrix-1),0), 0,
    n_triangular_elements, N )
all( wzar_cpp_all2 == VGAM_solutions$wzar )
## [1] TRUE
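
For reference, a minimal RcppArmadillo sketch of the double-loop conversion might look like this (hypothetical; the compiled versions live in cpp_implementations.cpp, which is not shown here):

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// wz: n x dimm weights in matrix-band format;
// row/col: 0-based iam() indices for each band column.
// [[Rcpp::export]]
arma::cube m2a_sketch(const arma::mat& wz, const int M,
                      const arma::uvec& row, const arma::uvec& col) {
  arma::cube out(M, M, wz.n_rows, arma::fill::zeros);
  for (arma::uword i = 0; i < wz.n_rows; ++i)      // over observations
    for (arma::uword k = 0; k < wz.n_cols; ++k) {  // over band entries
      out(row(k), col(k), i) = wz(i, k);           // one triangle ...
      out(col(k), row(k), i) = wz(i, k);           // ... and its mirror
    }
  return out;
}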

Time for some benchmarking:

# sourceCpp( "cpp_implementations.cpp" )

wz_weights = wweights( fit, matrix.arg = TRUE, deriv.arg = FALSE )
N = 8

microbenchmark::microbenchmark(
    # Standard
    VGAM = m2a(wz_weights, M = pred_dim),
    # c++, but with double loop
    cpp1 = m2a_cpp( wz_weights, pred_dim, 0, 
        index$row.index-1, index$col.index-1 ),
    cpp12 = m2a_cpp_new( wz_weights, pred_dim, 0, 
        index$row.index-1, index$col.index-1 ),
    # c++ but using matrix indexing
    cpp2 = m2a_cpp_x( wz_weights, t(index_matrix-1), 0,
        n_triangular_elements, N ),
    # c++ but using cube indexing
    cpp3 = m2a_cpp_x2( wz_weights, rbind(t(index_matrix-1),0), 0,
        n_triangular_elements, N ),
    cpp4 = m2a_cpp_x3( wz_weights, rbind(t(index_matrix-1),0), 0,
        n_triangular_elements, N ),
    times = 10000
)
## Unit: microseconds
##   expr  min   lq     mean median   uq    max neval
##   VGAM 39.2 44.6 64.71365   47.8 63.3 5005.5 10000
##   cpp1  3.7  4.5  7.15740    5.0  6.1 3169.4 10000
##  cpp12  3.2  3.9  6.01339    4.3  5.4 1876.3 10000
##   cpp2  7.2  8.2 12.31069    8.8 10.1 4195.4 10000
##   cpp3  8.7  9.7 13.77574   10.6 12.3 1302.9 10000
##   cpp4  8.5  9.5 13.55257   10.4 12.1 1143.2 10000

So, it looks like using the matrix/cube indexing is not faster. I think that is because I am taking non-contiguous matrix subviews. Changing that would take more effort than it is worth here, so I will leave it. There is still room to improve m2a_cpp, because I am not very good at c++.

Other issues

I should have looked at the scaling earlier. It turns out that if we scale up the input (matrix-band format) matrix, our “fast” solutions become really slow.

Submatrix views are nice to program, but get very slow when N increases.
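
For reference, the subview/matrix-indexing style probably looks something like this hypothetical sketch (the real m2a_cpp_x variants are in cpp_implementations.cpp): each slice is filled with a single matrix-indexed assignment rather than element-by-element writes.

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// idx: 2 x dimm matrix of 0-based (row, col) subscripts, as passed from R
// via t(index_matrix - 1).
// [[Rcpp::export]]
arma::cube m2a_subview_sketch(const arma::mat& wz, const arma::umat& idx,
                              const int M) {
  arma::cube out(M, M, wz.n_rows);
  arma::mat Si(M, M, arma::fill::zeros);
  const arma::uvec lin = arma::sub2ind(arma::size(Si), idx);  // linear indices
  for (arma::uword i = 0; i < wz.n_rows; ++i) {
    Si.zeros();
    Si.elem(lin) = wz.row(i).t();      // one triangle via matrix indexing
    out.slice(i) = arma::symmatu(Si);  // mirror the upper triangle
    // (use symmatl instead if the iam() indices reference the lower triangle)
  }
  return out;
}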

wz_weights = wz_weights[ rep( 1: 8, each = 10000 ), ]
N = nrow(wz_weights)

microbenchmark::microbenchmark(
    # Standard
    VGAM = m2a(wz_weights, M = pred_dim),
    # c++, but with double loop
    cpp1 = m2a_cpp( wz_weights, pred_dim, 0, 
        index$row.index-1, index$col.index-1 ),
    cpp12 = m2a_cpp_new( wz_weights, pred_dim, 0, 
        index$row.index-1, index$col.index-1 ),
    # c++ but using matrix indexing
    # cpp2 = m2a_cpp_x( wz_weights, t(index_matrix-1), 0,
    #     n_triangular_elements, N ),
    # c++ but using cube indexing
    # cpp3 = m2a_cpp_x2( wz_weights, rbind(t(index_matrix-1),0), 0,
    #     n_triangular_elements, N ),
    # cpp4 = m2a_cpp_x3( wz_weights, rbind(t(index_matrix-1),0), 0,
    #     n_triangular_elements, N ),
    times = 100
)
## Unit: milliseconds
##   expr     min      lq      mean   median       uq     max neval
##   VGAM  2.2727  2.6430  4.475432  3.16635  5.17215 13.4041   100
##   cpp1 21.0820 22.6625 25.869747 24.04800 27.89590 46.3528   100
##  cpp12  3.4784  4.1511  5.968601  4.75235  5.70915 71.1899   100

Proposed solutions 2

The next part of our solution is to separate the design matrix into blocks for matrix multiplication with the Cholesky factors of wz_weights.

wz_weights = wweights( fit, matrix.arg = TRUE, deriv.arg = FALSE )
N = nrow( wz_weights )

xmat = model.matrix(fit, type = "vlm")
xmat_array = xmat_to_array( xmat, pred_dim )
xmat_array[,,1:2]
## , , 1
## 
##      [,1] [,2]     [,3]     [,4]
## [1,]    1    0 1.757858 0.000000
## [2,]    0    1 0.000000 1.757858
## 
## , , 2
## 
##      [,1] [,2]    [,3]    [,4]
## [1,]    1    0 2.70805 0.00000
## [2,]    0    1 0.00000 2.70805
head(xmat)
##     (Intercept):1 (Intercept):2    let:1    let:2
## 1:1             1             0 1.757858 0.000000
## 1:2             0             1 0.000000 1.757858
## 2:1             1             0 2.708050 0.000000
## 2:2             0             1 0.000000 2.708050
## 3:1             1             0 3.068053 0.000000
## 3:2             0             1 0.000000 3.068053
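
Under the hood, xmat_to_array presumably just slices the (n*M) x R design matrix into n blocks of M rows; a hypothetical RcppArmadillo sketch:

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// Split the (n*M) x R VLM design matrix into n slices of M x R each.
// [[Rcpp::export]]
arma::cube xmat_to_array_sketch(const arma::mat& xmat, const int M) {
  const arma::uword n = xmat.n_rows / M;
  arma::cube out(M, xmat.n_cols, n);
  for (arma::uword i = 0; i < n; ++i)
    out.slice(i) = xmat.rows(i * M, (i + 1) * M - 1);
  return out;
}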

Proposed solutions 3

Now, we want to do the Cholesky decomposition for each wz_weights submatrix and then multiply the xmat submatrices.

UU.X.vlm_cpp = do.call( rbind, lapply( 1:N, function(n_){
    chol(wzar_cpp[,,n_]) %*% xmat_array[,,n_]
} ) )
# As required
all( UU.X.vlm == UU.X.vlm_cpp )
## [1] TRUE

And now we just do it all in c++ instead of R.

UU.X.vlm_cpp2 = mux111ccc_cpp( wz_weights, xmat, pred_dim, 0,
    index$row.index-1, index$col.index-1 )
# As required
all( UU.X.vlm_cpp2 == UU.X.vlm )
## [1] TRUE
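
The compiled function is not shown here, but a hypothetical RcppArmadillo sketch of the fused computation (rebuild each symmetric weight matrix from band format, factor it, multiply the matching design block) would be:

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// wz: n x dimm band-format weights; xmat: (n*M) x R design matrix;
// row/col: 0-based iam() indices. Name and signature are illustrative.
// [[Rcpp::export]]
arma::mat mux111ccc_sketch(const arma::mat& wz, const arma::mat& xmat,
                           const int M, const arma::uvec& row,
                           const arma::uvec& col) {
  arma::mat ans(xmat.n_rows, xmat.n_cols);
  arma::mat Si(M, M);
  for (arma::uword i = 0; i < wz.n_rows; ++i) {
    Si.zeros();
    for (arma::uword k = 0; k < wz.n_cols; ++k) {
      Si(row(k), col(k)) = wz(i, k);      // fill one triangle ...
      Si(col(k), row(k)) = wz(i, k);      // ... and mirror (Si is symmetric)
    }
    const arma::mat Ui = arma::chol(Si);  // upper-triangular factor
    ans.rows(i * M, (i + 1) * M - 1) =
      Ui * xmat.rows(i * M, (i + 1) * M - 1);
  }
  return ans;
}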

Some benchmarking

microbenchmark::microbenchmark(
    VGAM = mux111(cc = UU, xmat = X.vlm, M = M),
    VGAM2 = {
        UU <- vchol(wzedd, M = M, n = nn, silent = TRUE)
        mux111(cc = UU, xmat = X.vlm, M = M)
    },
    cpp = mux111ccc_cpp( wz_weights, xmat, pred_dim, 0,
    index$row.index-1, index$col.index-1 ),
    times = 1000
)
## Unit: microseconds
##   expr  min   lq     mean median     uq     max neval
##   VGAM 47.6 51.0  93.8386   53.5  61.90 28192.9  1000
##  VGAM2 89.6 97.1 128.5661  101.0 124.75  6462.1  1000
##    cpp  7.0  8.7  15.3344   10.6  12.10  2418.9  1000

We see that the c++ implementation is faster. Note that it also performs the Cholesky decomposition, whereas for the VGAM timing the decomposition is precomputed as UU (the VGAM2 timing includes it via vchol).

Big dataset

library( VGAM )
library( Rcpp )
sourceCpp( "cpp_implementations.cpp" )

pneumo <- transform(pneumo, let = log(exposure.time))

big_data_vglm = vglm(formula = cbind(normal, mild, severe) ~ let, 
    family = acat, 
    data = pneumo[rep(1:8, 1000),])
big_M <- npred(big_data_vglm)
big_nn <- nobs(big_data_vglm)
big_wzedd <- weights(big_data_vglm, type = "working")
# big_UU holds the Cholesky factors in matrix-band format
big_UU <- vchol(big_wzedd, M = big_M, n = big_nn, silent = TRUE)
big_X.vlm <- model.matrix(big_data_vglm, type = "vlm")
big_UU.X.vlm <- mux111(cc = big_UU, xmat = big_X.vlm, M = big_M)
index <- iam(NA, NA, big_M, both = TRUE, diag = TRUE)

N = nrow( big_wzedd )
# Recovers M by inverting dimm = M * (M + 1) / 2
n_triangular_elements = 1/2 * ( sqrt( 8 * ncol(big_wzedd) + 1 ) - 1 )
index_matrix = cbind( index$row.index, index$col.index )

microbenchmark::microbenchmark(
    # VGAM = mux111(cc = big_UU, xmat = big_X.vlm, M = big_M),
    VGAM2 = {
        UU <- vchol(big_wzedd, M = big_M, n = big_nn, silent = TRUE)
        mux111(cc = UU, xmat = big_X.vlm, M = big_M)
    },
    # cpp = mux111ccc_cpp( big_wzedd, big_X.vlm, big_M, 0,
    #     index$row.index-1, index$col.index-1 ),
    cpp_ = mux111ccc_cpp_( big_wzedd, big_X.vlm, big_M, 0,
        index$row.index-1, index$col.index-1 ),
    # cpp2 = mux111ccc_cpp2( big_wzedd, big_X.vlm, big_M, t(index_matrix-1), 0,
    #     n_triangular_elements, N ),
    times = 100
)
## Unit: microseconds
##   expr    min      lq     mean median      uq    max neval
##  VGAM2  956.4 1057.15 1448.306 1146.7 1464.20 5645.4   100
##   cpp_ 1613.2 1883.65 2336.108 2157.8 2621.65 5535.6   100

So at this scale the c++ version no longer wins: VGAM2, which even includes the vchol step, comes out faster than cpp_. This echoes the scaling issue noted above, so there is more work to do here.