An Introduction to HMLasso

Masaaki Takada, Toshiba Corporation

2019-08-02

We introduce a simple regression problem and compare the performance of mean imputation, CoCoLasso, and HMLasso. Running this vignette takes several minutes because of our simple implementation. For details of HMLasso, please refer to the following paper.

Takada, M., Fujisawa, H., & Nishikawa, T. (2019). “HMLasso: Lasso with High Missing Rate.” IJCAI. <arXiv:1811.00255>.

NOTE: Because of the simple implementation, fewer than 200 variables are desirable from the perspective of computational efficiency.

Setup

First, we load the required libraries.

knitr::opts_chunk$set(echo = TRUE)
library(hmlasso)
library(MASS)

Data Preparation

Next, we define a simple regression problem.

# setting
n <- 2000 # number of samples
p <- 20 # number of dimensions
r <- 0.5 # correlation levels
eps <- 1 # noise level

# construct covariance matrix
mu <- rep(0, length.out=p) # mean vector
Sigma <- matrix(r, nrow=p, ncol=p) # correlation between distinct variables is r (0.5)
diag(Sigma) <- 1 # variance of each variable is 1

# generate training data
set.seed(0)
X <- mvrnorm(n, mu, Sigma)
b <- matrix(0, nrow = p, ncol = 1)
b[1:10,] <- (-1)^(1:10) * (1:10)
y <- X %*% b + eps * rnorm(n)
colnames(X) <- paste0("V", 1:p)
rownames(b) <- paste0("V", 1:p)

# introduce missing data
X_incompl <- X
mrc <- runif(p, min=0.1, max=0.9) # missing rate of each column
for (j in 1:p) {
  na_rows <- sample(1:nrow(X), round(nrow(X) * mrc[j])) # rows set to NA in column j
  X_incompl[na_rows, j] <- NA
}
# visualize the missing pattern (black: NA, white: observed)
image(t(apply(+is.na(X_incompl), 2, rev)), col=grey(11:0/12), main="missing pattern")
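
As a quick sanity check (an addition for illustration, not part of the original analysis), we can compare the realized missing rate of each column with the designed rates in mrc.

# realized per-column missing rates; these should be close to mrc
round(colMeans(is.na(X_incompl)), 2)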

# generate test data
n_ts <- 10000
X_ts <- mvrnorm(n_ts, mu, Sigma)
y_ts <- X_ts %*% b + eps * rnorm(n_ts)
colnames(X_ts) <- paste0("V", 1:p)

# set up cross-validation folds
nfolds <- 5
foldid1 <- sample(rep(1:nfolds, (n %/% nfolds)), replace=FALSE)
foldid2 <- sample(1:nfolds, (n %% nfolds), replace=FALSE)
foldid <- c(foldid1, foldid2)
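
Since n is a multiple of nfolds here, each fold receives exactly n / nfolds = 400 samples, which can be verified with a one-line check (optional, not in the original vignette).

table(foldid) # fold sizes: 400 samples per fold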

Mean Imputation

We apply the mean imputation method to the problem.
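
Conceptually, mean imputation fills each missing entry with the mean of the observed values in the same column before fitting the Lasso. The following sketch illustrates the idea (for exposition only; cv.hmlasso with impute_method="mean" performs the imputation internally).

# illustrative mean imputation (equivalent in spirit to impute_method="mean")
X_meanimp <- X_incompl
for (j in 1:p) {
  X_meanimp[is.na(X_meanimp[, j]), j] <- mean(X_incompl[, j], na.rm=TRUE)
}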

# MeanImp
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
                     foldid=foldid, impute_method="mean",
                     direct_prediction=FALSE, positify="mean")
plot(cv_fit) # plot lambda-MSE

plot(cv_fit$fit) # plot solution path

cv_fit$cve[cv_fit$lambda.min.index] # minimum cross-validation error
## [1] 160.8224
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficient
##        V1        V2        V3        V4        V5        V6        V7 
##  0.000000  0.000000 -1.014909  3.029420 -1.541143  4.098668 -4.438788 
##        V8        V9       V10       V11       V12       V13       V14 
##  3.277524 -7.166381  8.275885  0.000000  0.000000  0.000000  0.000000 
##       V15       V16       V17       V18       V19       V20 
##  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.788304
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts) # prediction error
## [1] 5.681397

CoCoLasso

We apply CoCoLasso to the problem.
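
CoCoLasso estimates the covariance matrix from pairwise-complete observations, which is in general not positive semidefinite when data are missing, and then projects it onto the positive semidefinite cone (here via ADMM under the max norm, positify="admm_max") before solving the Lasso. The indefiniteness is easy to observe directly (a check for illustration, independent of the package internals).

# the pairwise-complete covariance matrix can have negative eigenvalues
S_pair <- cov(X_incompl, use="pairwise.complete.obs")
min(eigen(S_pair, symmetric=TRUE, only.values=TRUE)$values) # may be negative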

# CoCoLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
                     foldid=foldid, direct_prediction=TRUE,
                     positify="admm_max", weight_power=0)
plot(cv_fit) # plot lambda-MSE

plot(cv_fit$fit) # plot solution path

cv_fit$cve[cv_fit$lambda.min.index] # minimum cross-validation error
## [1] 56.62824
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficient
##        V1        V2        V3        V4        V5        V6        V7 
##  0.000000  0.000000 -1.093565  3.116700 -1.612793  4.151754 -4.508953 
##        V8        V9       V10       V11       V12       V13       V14 
##  3.328830 -7.223450  8.355743  0.000000  0.000000  0.000000  0.000000 
##       V15       V16       V17       V18       V19       V20 
##  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.628155
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts) # prediction error
## [1] 5.571061

HMLasso

We apply HMLasso to the problem.
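
HMLasso replaces the uniform projection of CoCoLasso with a weighted one: entries of the pairwise covariance estimated from many jointly observed samples are matched more closely than entries estimated from few, with weight_power controlling how strongly the pairwise observation ratios enter the weights (here under the Frobenius norm, positify="admm_frob"). The pairwise observation counts underlying these weights can be inspected directly (an illustrative check, not the package's internal code).

# number of jointly observed samples for each variable pair
n_pair <- crossprod(!is.na(X_incompl))
round(n_pair[1:5, 1:5] / n, 2) # pairwise observation ratios (first 5 variables)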

# HMLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1,
                     foldid=foldid, direct_prediction=TRUE,
                     positify="admm_frob", weight_power=1)
plot(cv_fit) # plot lambda-MSE

plot(cv_fit$fit) # plot solution path

cv_fit$cve[cv_fit$lambda.min.index] # minimum cross-validation error
## [1] 52.7777
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficient
##          V1          V2          V3          V4          V5          V6 
##  0.00000000  0.01459514 -1.24321064  3.27888761 -1.74878873  4.24887374 
##          V7          V8          V9         V10         V11         V12 
## -4.64075028  3.42379387 -7.33257437  8.50260041  0.00000000  0.00000000 
##         V13         V14         V15         V16         V17         V18 
##  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000 
##         V19         V20 
##  0.00000000  0.00000000
pred <- predict(cv_fit$fit, X_ts) # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
## [1] 7.329065
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts) # prediction error
## [1] 5.36526
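
In this example, HMLasso achieves the smallest cross-validation error (52.8, versus 56.6 for CoCoLasso and 160.8 for mean imputation), as well as the smallest estimation error (7.33) and prediction error (5.37), illustrating its advantage when missing rates are high.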