We have been given a quest by the VGAM Lord Thomas Yee (t-yeezy) to improve some of the matrix multiplication in VGAM.
The particular problem is transforming a band-format matrix, doing Cholesky decomposition for each of the resulting square matrices, and multiplying it with a design matrix.
We will use an ordinal multinomial example based on the pneumo data frame, which contains 8 rows (observations) and 4 columns; the example code adds the predictor let = log(exposure.time). The model fits the ordinal response category (normal < mild < severe) as a function of let. From what I think I understand, this is an adjacent-categories model: the default in acat is the log link, applied to ratios of adjacent category probabilities (the fitted object below confirms it is an adjacent categories model, not a cumulative-link one).
library( devtools )
## Loading required package: usethis
library( VGAM )
## Loading required package: stats4
## Loading required package: splines
example_output = capture.output( example(acat) )
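# example(acat) leaves the fitted model object, fit, in the workspace;
# capture.output() just suppresses the example's printed output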
# Dataset
pneumo
## exposure.time normal mild severe let
## 1 5.8 98 0 0 1.757858
## 2 15.0 51 2 1 2.708050
## 3 21.5 34 6 3 3.068053
## 4 27.5 35 5 8 3.314186
## 5 33.5 32 10 9 3.511545
## 6 39.5 23 7 8 3.676301
## 7 46.0 12 6 10 3.828641
## 8 51.5 4 2 5 3.941582
# Model
fit
##
## Call:
## vglm(formula = cbind(normal, mild, severe) ~ let, family = acat,
## data = pneumo)
##
##
## Coefficients:
## (Intercept):1 (Intercept):2 let:1 let:2
## -8.9360297 -3.0390622 2.1653729 0.9020936
##
## Degrees of Freedom: 16 Total; 12 Residual
## Residual deviance: 5.347382
## Log-likelihood: -25.25054
##
## This is an adjacent categories model with 3 levels
We now extract the model dimensions and the working weights.
M <- npred(fit)
nn <- nobs(fit)
wzedd <- weights(fit, type = "working")
# UU holds the upper-triangular Cholesky factor of each weight matrix, in matrix-band format
UU <- vchol(wzedd, M = M, n = nn, silent = TRUE)
X.vlm <- model.matrix(fit, type = "vlm")
UU.X.vlm <- mux111(cc = UU, xmat = X.vlm, M = M)
UU.X.vlm
## (Intercept):1 (Intercept):2 let:1 let:2
## 1:1 0.8398341 0.1591424 1.476309 0.2797498
## 1:2 0.0000000 0.3303302 0.000000 0.5806735
## 2:1 1.8376577 0.6527830 4.976469 1.7677693
## 2:2 0.0000000 0.9105143 0.000000 2.4657183
## 3:1 2.3485868 1.0159178 7.205589 3.1168895
## 3:2 0.0000000 1.2628877 0.000000 3.8746064
## 4:1 3.0058420 1.4658393 9.961919 4.8580639
## 4:2 0.0000000 1.7365937 0.000000 5.7553944
## 5:1 3.4291766 1.8247533 12.041709 6.4077042
## 5:2 0.0000000 2.1398457 0.000000 7.5141655
## 6:1 3.0752930 1.7494893 11.305702 6.4316488
## 6:2 0.0000000 2.0851612 0.000000 7.6656795
## 7:1 2.6207089 1.5782477 10.033754 6.0425443
## 7:2 0.0000000 1.9529582 0.000000 7.4771765
## 8:1 1.5901746 0.9960147 6.267803 3.9258735
## 8:2 0.0000000 1.2854459 0.000000 5.0666903
The underlying computations in vchol appear to be:
# Convert rows of the weights matrix (matrix-band format) to square matrices.
# Each row holds the M(M+1)/2 unique elements of a symmetric M x M matrix,
# diagonals first: for M = 2 the columns are (1,1), (2,2), (1,2)
wzar <- m2a(wzedd, M = M)
wzar[,,1]
## [,1] [,2]
## [1,] 0.7053214 0.1336533
## [2,] 0.1336533 0.1344443
# Do the Cholesky decomposition
chol(wzar[,,1]) # Same as UU[, 1]
## [,1] [,2]
## [1,] 0.8398341 0.1591424
## [2,] 0.0000000 0.3303302
# Notes:
# vchol also applies a correction if the Cholesky decomposition fails
Inside mux111, it appears to reconstruct the square matrices from UU to do the matrix multiplication with the corresponding rows of X.vlm. Some notes from Thomas:
mux111 <- function(cc, xmat, M, upper = TRUE) {
# 19990813
#
# 20170605: some modifications to get hdeff() going.
# Input:
# 1. cc is dimm x n, where dimm is
# usually M or M(M+1)/2, e.g., is output from vchol().
# That is, cc is some matrix in matrix-band format.
# 2. xmat is n*M x R. Each sub-block is M x R and is
# multiplied by each column of cc.
#
# Output:
# 1. (n*M) x R matrix, such that, in old notation,
# ans[1:M, ] <- cc[, , 1] %*% xmat[1:M, ], etc.
#
# Notes:
# 1. It is specifically geared to vglm.fit(), and is a special case
# of mux11().
# 2. If M = 1 then cc cannot simply be a vector.
# 20170605; not sure about the above line.
# 3. NAs in xmat and/or cc are okay.
# 4. cc represents an upper-triangular matrix by default, but if not
# then it is symmetric (set upper = FALSE).
if (!is.matrix(xmat))
xmat <- as.matrix(xmat)
if (!is.matrix(cc))
cc <- t(as.matrix(cc))
R <- ncol(xmat)
n <- nrow(xmat) / M
index <- iam(NA, NA, M, both = TRUE, diag = TRUE)
dimm.value <- nrow(cc) # Usually M or M(M+1)/2
fred <- .C("mux111ccc",
as.double(cc), b = as.double(t(xmat)),
as.integer(M), as.integer(R), as.integer(n),
wkcc = double(M * M), wk2 = double(M * R),
as.integer(index$row), as.integer(index$col),
as.integer(dimm.value),
as.integer(upper), NAOK = TRUE)
ans <- fred$b
dim(ans) <- c(R, n * M)
d <- dimnames(xmat)
dimnames(ans) <- list(d[[2]], d[[1]])
t(ans)
}
Make a list of solutions for testing.
VGAM_solutions = list(
wzedd = wzedd,
wzar = wzar,
UU = UU,
UU.X.vlm = UU.X.vlm
)
The plan: the current computation goes matrix-band (wzedd) -> square matrix -> Cholesky (vchol) -> matrix-band (UU) -> square matrix again (mux111), so we can skip the last conversion by making a version of mux111 that can accept an array of square matrices. We also use armadillo to abstract away the single/double loops as vector operations, to improve readability/maintainability.
library( Rcpp )
sourceCpp( "cpp_implementations.cpp" )
wz_weights = wweights( fit, matrix.arg = TRUE, deriv.arg = FALSE )
N = nobs(fit)
pred_dim = npred(fit)
index <- iam(NA, NA, pred_dim, both = TRUE, diag = TRUE)
# We begin with a solution that does a double loop:
# the first loop iterates over the N observations and assigns the
# diagonals from the matrix-band format to the return array;
# the second loop assigns the off-diagonal elements
wzar_cpp = m2a_cpp( wz_weights, pred_dim, 0,
index$row.index-1, index$col.index-1 )
all( wzar_cpp == VGAM_solutions$wzar )
## [1] TRUE
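The actual implementations live in cpp_implementations.cpp, which isn't reproduced here, so for orientation here is a minimal RcppArmadillo sketch of the double-loop idea. The name m2a_sketch and the exact signature are my own assumptions; the real m2a_cpp also takes extra arguments (e.g. the 0 flag above) that I don't model.
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// Sketch only: expand an n x M(M+1)/2 matrix-band weights matrix into an
// M x M x n cube of symmetric matrices, using the 0-based row/column
// indices produced by VGAM's iam(). (Hypothetical, not the real m2a_cpp.)
// [[Rcpp::export]]
arma::cube m2a_sketch(const arma::mat& wz, const int M,
                      const arma::uvec& row_index,
                      const arma::uvec& col_index) {
  const arma::uword n = wz.n_rows;
  arma::cube out(M, M, n, arma::fill::zeros);
  for (arma::uword i = 0; i < n; ++i) {            // over observations
    for (arma::uword k = 0; k < wz.n_cols; ++k) {  // over band columns
      out(row_index(k), col_index(k), i) = wz(i, k);
      out(col_index(k), row_index(k), i) = wz(i, k);  // mirror: symmetric
    }
  }
  return out;
}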
# Alternative solution for one row that uses matrix indexing
N = nrow( wz_weights )
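# Despite the name, this recovers M by solving M*(M+1)/2 = ncol(wz_weights)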
n_triangular_elements = 1/2 * ( sqrt( 8 * ncol(wz_weights) + 1 ) - 1 )
index_matrix = cbind( index$row.index, index$col.index )
wzar_cpp_single = sapply( 1:N, function( N_ ){
result = m2a_cpp_1( wz_weights, t(index_matrix-1), 0,
n_triangular_elements, N_ - 1 )
result == VGAM_solutions$wzar[,,N_]
} )
all( wzar_cpp_single )
## [1] TRUE
# Extend this result by looping over N in c++
wzar_cpp_all = m2a_cpp_x( wz_weights, t(index_matrix-1), 0,
n_triangular_elements, N )
all( wzar_cpp_all == VGAM_solutions$wzar )
## [1] TRUE
# An alternative to the above: instead of indexing the return array via
# slice views (x.slice(index)), we try to use cube indexing
wzar_cpp_all2 = m2a_cpp_x2( wz_weights, rbind(t(index_matrix-1),0), 0,
n_triangular_elements, N )
all( wzar_cpp_all2 == VGAM_solutions$wzar )
## [1] TRUE
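To make the "cube indexing" idea concrete: compute linear positions within each M x M slice once, then assign a whole observation's band entries in one .elem() call. Again a hedged sketch under my own naming, not the actual m2a_cpp_x2.
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// Sketch only: fill each slice through linear (flattened) indices rather
// than an element-by-element double loop. (Hypothetical, not m2a_cpp_x2.)
// [[Rcpp::export]]
arma::cube m2a_cube_index_sketch(const arma::mat& wz, const int M,
                                 const arma::uvec& row_index,
                                 const arma::uvec& col_index) {
  const arma::uword n = wz.n_rows, MM = M;
  arma::cube out(MM, MM, n, arma::fill::zeros);
  const arma::uvec upper = row_index + col_index * MM;  // (r, c) positions
  const arma::uvec lower = col_index + row_index * MM;  // mirrored (c, r)
  for (arma::uword i = 0; i < n; ++i) {
    out.slice(i).elem(upper) = wz.row(i).t();
    out.slice(i).elem(lower) = wz.row(i).t();  // diagonal rewritten, harmless
  }
  return out;
}
The writes are still scattered in memory, which may be why this is no faster than the double loop (see the benchmarks below).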
Time for some benchmarking:
# sourceCpp( "cpp_implementations.cpp" )
wz_weights = wweights( fit, matrix.arg = TRUE, deriv.arg = FALSE )
N = 8
microbenchmark::microbenchmark(
# Standard
VGAM = m2a(wz_weights, M = pred_dim),
# c++, but with double loop
cpp1 = m2a_cpp( wz_weights, pred_dim, 0,
index$row.index-1, index$col.index-1 ),
cpp12 = m2a_cpp_new( wz_weights, pred_dim, 0,
index$row.index-1, index$col.index-1 ),
# c++ but using matrix indexing
cpp2 = m2a_cpp_x( wz_weights, t(index_matrix-1), 0,
n_triangular_elements, N ),
# c++ but using cube indexing
cpp3 = m2a_cpp_x2( wz_weights, rbind(t(index_matrix-1),0), 0,
n_triangular_elements, N ),
cpp4 = m2a_cpp_x3( wz_weights, rbind(t(index_matrix-1),0), 0,
n_triangular_elements, N ),
times = 10000
)
## Unit: microseconds
## expr min lq mean median uq max neval
## VGAM 39.2 44.6 64.71365 47.8 63.3 5005.5 10000
## cpp1 3.7 4.5 7.15740 5.0 6.1 3169.4 10000
## cpp12 3.2 3.9 6.01339 4.3 5.4 1876.3 10000
## cpp2 7.2 8.2 12.31069 8.8 10.1 4195.4 10000
## cpp3 8.7 9.7 13.77574 10.6 12.3 1302.9 10000
## cpp4 8.5 9.5 13.55257 10.4 12.1 1143.2 10000
So it looks like the matrix/cube indexing is not faster. I think this is because I am taking non-contiguous matrix subviews. It's too hard to change this, so I'll just leave it. There is also room to improve m2a_cpp, because I'm not very good at c++.
I should have looked at the scaling. It turns out that if we scale up the input (band-format) matrix, our fast solutions become really slow: submatrix views are nice to program with, but get very slow as N increases.
wz_weights = wz_weights[ rep( 1: 8, each = 10000 ), ]
N = nrow(wz_weights)
microbenchmark::microbenchmark(
# Standard
VGAM = m2a(wz_weights, M = pred_dim),
# c++, but with double loop
cpp1 = m2a_cpp( wz_weights, pred_dim, 0,
index$row.index-1, index$col.index-1 ),
cpp12 = m2a_cpp_new( wz_weights, pred_dim, 0,
index$row.index-1, index$col.index-1 ),
# c++ but using matrix indexing
# cpp2 = m2a_cpp_x( wz_weights, t(index_matrix-1), 0,
# n_triangular_elements, N ),
# c++ but using cube indexing
# cpp3 = m2a_cpp_x2( wz_weights, rbind(t(index_matrix-1),0), 0,
# n_triangular_elements, N ),
# cpp4 = m2a_cpp_x3( wz_weights, rbind(t(index_matrix-1),0), 0,
# n_triangular_elements, N ),
times = 100
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## VGAM 2.2727 2.6430 4.475432 3.16635 5.17215 13.4041 100
## cpp1 21.0820 22.6625 25.869747 24.04800 27.89590 46.3528 100
## cpp12 3.4784 4.1511 5.968601 4.75235 5.70915 71.1899 100
The next part of our solution is to separate the design matrix into components for matrix multiplication with chol(wz_weights).
wz_weights = wweights( fit, matrix.arg = TRUE, deriv.arg = FALSE )
N = nrow( wz_weights )
xmat = model.matrix(fit, type = "vlm")
xmat_array = xmat_to_array( xmat, pred_dim )
xmat_array[,,1:2]
## , , 1
##
## [,1] [,2] [,3] [,4]
## [1,] 1 0 1.757858 0.000000
## [2,] 0 1 0.000000 1.757858
##
## , , 2
##
## [,1] [,2] [,3] [,4]
## [1,] 1 0 2.70805 0.00000
## [2,] 0 1 0.00000 2.70805
head(xmat)
## (Intercept):1 (Intercept):2 let:1 let:2
## 1:1 1 0 1.757858 0.000000
## 1:2 0 1 0.000000 1.757858
## 2:1 1 0 2.708050 0.000000
## 2:2 0 1 0.000000 2.708050
## 3:1 1 0 3.068053 0.000000
## 3:2 0 1 0.000000 3.068053
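xmat_to_array presumably just cuts the VLM model matrix into M-row blocks; a hedged sketch of that step (my own naming, assuming an M x R x n cube return, not the real xmat_to_array):
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// Sketch only: split the (n*M) x R VLM model matrix into an M x R x n
// cube, one M x R block per observation.
// [[Rcpp::export]]
arma::cube xmat_to_array_sketch(const arma::mat& xmat, const int M) {
  const arma::uword n = xmat.n_rows / M;
  arma::cube out(M, xmat.n_cols, n);
  for (arma::uword i = 0; i < n; ++i)
    out.slice(i) = xmat.rows(i * M, (i + 1) * M - 1);  // rows for obs i
  return out;
}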
Now we want to do the Cholesky decomposition of each wz_weights submatrix and then multiply it with the corresponding xmat submatrix.
UU.X.vlm_cpp = do.call( rbind, lapply( 1:N, function(n_){
chol(wzar_cpp[,,n_]) %*% xmat_array[,,n_]
} ) )
# As required
all( UU.X.vlm == UU.X.vlm_cpp )
## [1] TRUE
And we just do it in c++ instead of R.
UU.X.vlm_cpp2 = mux111ccc_cpp( wz_weights, xmat, pred_dim, 0,
index$row.index-1, index$col.index-1 )
# As required
all( UU.X.vlm_cpp2 == UU.X.vlm )
## [1] TRUE
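Putting the pieces together, the combined routine plausibly looks something like the sketch below: rebuild each M x M weight matrix from the band format, Cholesky-factor it, and multiply the matching M x R block of the model matrix. Names and signature are assumptions; the real mux111ccc_cpp also takes a flag I ignore here.
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// Sketch only: band -> square -> chol -> multiply, per observation,
// stacking the M x R results into an (n*M) x R matrix.
// (Hypothetical, not the real mux111ccc_cpp.)
// [[Rcpp::export]]
arma::mat mux111ccc_sketch(const arma::mat& wz, const arma::mat& xmat,
                           const int M,
                           const arma::uvec& row_index,
                           const arma::uvec& col_index) {
  const arma::uword n = wz.n_rows;
  arma::mat ans(xmat.n_rows, xmat.n_cols);
  arma::mat w(M, M);
  for (arma::uword i = 0; i < n; ++i) {
    w.zeros();
    for (arma::uword k = 0; k < wz.n_cols; ++k) {  // band -> square
      w(row_index(k), col_index(k)) = wz(i, k);
      w(col_index(k), row_index(k)) = wz(i, k);
    }
    // arma::chol() returns the upper-triangular factor U with
    // U.t() * U = w, matching R's chol()
    ans.rows(i * M, (i + 1) * M - 1) =
      arma::chol(w) * xmat.rows(i * M, (i + 1) * M - 1);
  }
  return ans;
}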
Some benchmarking:
microbenchmark::microbenchmark(
VGAM = mux111(cc = UU, xmat = X.vlm, M = M),
VGAM2 = {
UU <- vchol(wzedd, M = M, n = nn, silent = TRUE)
mux111(cc = UU, xmat = X.vlm, M = M)
},
cpp = mux111ccc_cpp( wz_weights, xmat, pred_dim, 0,
index$row.index-1, index$col.index-1 ),
times = 1000
)
## Unit: microseconds
## expr min lq mean median uq max neval
## VGAM 47.6 51.0 93.8386 53.5 61.90 28192.9 1000
## VGAM2 89.6 97.1 128.5661 101.0 124.75 6462.1 1000
## cpp 7.0 8.7 15.3344 10.6 12.10 2418.9 1000
We see that the c++ implementation is faster, and it is also doing the Cholesky decomposition on the fly, whereas for the VGAM solution UU is precomputed (so VGAM2, which includes the vchol step, is the fairer comparison). Finally, let's check how everything scales on a larger data set.
library( VGAM )
library( Rcpp )
sourceCpp( "cpp_implementations.cpp" )
pneumo <- transform(pneumo, let = log(exposure.time))
big_data_vglm = vglm(formula = cbind(normal, mild, severe) ~ let,
family = acat,
data = pneumo[rep(1:8, 1000),])
big_M <- npred(big_data_vglm)
big_nn <- nobs(big_data_vglm)
big_wzedd <- weights(big_data_vglm, type = "working")
# UU is matrix-band format
big_UU <- vchol(big_wzedd, M = big_M, n = big_nn, silent = TRUE)
big_X.vlm <- model.matrix(big_data_vglm, type = "vlm")
big_UU.X.vlm <- mux111(cc = big_UU, xmat = big_X.vlm, M = big_M)
index <- iam(NA, NA, big_M, both = TRUE, diag = TRUE)
N = nrow( big_wzedd )
n_triangular_elements = 1/2 * ( sqrt( 8 * ncol(big_wzedd) + 1 ) - 1 )
index_matrix = cbind( index$row.index, index$col.index )
microbenchmark::microbenchmark(
# VGAM = mux111(cc = big_UU, xmat = big_X.vlm, M = big_M),
VGAM2 = {
big_UU <- vchol(big_wzedd, M = big_M, n = big_nn, silent = TRUE)
mux111(cc = big_UU, xmat = big_X.vlm, M = big_M)
},
# cpp = mux111ccc_cpp( big_wzedd, big_X.vlm, big_M, 0,
# index$row.index-1, index$col.index-1 ),
cpp_ = mux111ccc_cpp_( big_wzedd, big_X.vlm, big_M, 0,
index$row.index-1, index$col.index-1 ),
# cpp2 = mux111ccc_cpp2( big_wzedd, big_X.vlm, big_M, t(index_matrix-1), 0,
# n_triangular_elements, N ),
times = 100
)
## Unit: microseconds
## expr min lq mean median uq max neval
## VGAM2 956.4 1057.15 1448.306 1146.7 1464.20 5645.4 100
## cpp_ 1613.2 1883.65 2336.108 2157.8 2621.65 5535.6 100
At this larger scale the c++ version is slower than VGAM2 after all, consistent with the scaling problems we saw earlier.