## Main function for Sparse Tensor Outcome REgression (STORE) by Sun and Li (2017, JMLR)
## Maintenance: Will Wei Sun
## Last edited: Sep 1, 2017

library(igraph)
# if "library(remMap)" fails: install.packages("~/Research/STOR/remMap_0.2-0.tar.gz", repos = NULL, type = "source")
library(remMap)
# if "library(caret)" fails: install.packages("~/pbkrtest_0.4-5.tar.gz", repos = NULL, type = "source")
library(caret)
library(rTensor)

#####################
## Utility Functions
#####################

EPS = 1e-4

rotate <- function(x) t(apply(x, 2, rev))

fnorm <- function(T){
  ### Tensor Frobenius norm \| T \|_F ###
  return(sqrt(sum(T^2)))
}

myadj = function(A){
  # calculate the adjacency matrix of a matrix A
  connect.mat = A
  p = ncol(A)
  for(i in 1:p){
    for(j in 1:p){
      connect.mat[i,j] = ifelse(abs(A[i,j]) > 1e-5, 1, 0)
    }
  }
  connect.mat
}

mysd = function(x){
  # scale x to unit l2 norm; return x unchanged if it is the zero vector
  if(norm(x, "2") == 0){
    return(x)
  }else{
    # as.numeric() unwraps the 1 x 1 matrix t(x) %*% x; dividing a vector by a
    # 1 x 1 matrix directly raises a non-conformable-arrays error in R
    return(x / as.numeric(sqrt(t(x) %*% x)))
  }
}

mynorm = function(x){
  (x - mean(x)) / sd(x)
}

mycenter = function(x){
  x - mean(x)
}

mytensorcenter = function(y){
  # y is a list of tensors; subtract the sample mean tensor from each element
  n = length(y)
  y.center = Reduce("+", y) / length(y)
  y.new = y
  for(i in 1:n){
    y.new[[i]] = y[[i]] - y.center
  }
  y.new
}

mytruncate = function(x, s){
  # keep the s largest-magnitude entries of x and set the rest to zero
  xtruncate = rep(0, length(x))
  xtruncate[which(rank(-abs(x)) <= s)] = x[which(rank(-abs(x)) <= s)]
  xtruncate
}

myerrorlist = function(beta.old, beta.new){
  # average sign-invariant distance between two lists of component vectors
  nlist = length(beta.old)
  error = 0
  for(i in 1:nlist){
    error = error + min(fnorm(beta.old[[i]] - beta.new[[i]]), fnorm(beta.old[[i]] + beta.new[[i]]))
  }
  error / nlist
}

Ta = function(T, a){
  # For a tensor T \in R^{d1 * d2 * d3 * p} and a vector a \in R^p, compute the
  # last-mode tensor-vector product T \times_4 a (mode 3 for a 3-way input).
  DD = dim(T)
  a = as.numeric(a)
  p = length(a)
  if(DD[length(DD)] != p){
    stop("Dimensions of T and a do not fit")
  }else{
    tmp = array(0, DD[1:(length(DD) - 1)])
    for(i in 1:p){
      if(length(DD) == 3){
        tmp = tmp + T[,,i] * a[i]
      }else if(length(DD) == 4){
        tmp = tmp + T[,,,i] * a[i]
      }
    }
  }
  tmp
}

TA = function(T, A){
  # For a tensor T \in R^{d1 * d2 * d3 * p} and a matrix A \in R^{n * p}, compute
  # the mode-4 product T \times_4 A[i,] for each i = 1, ..., n
  n = nrow(A)
  out = list()
  for(i in 1:n){
    out[[i]] = Ta(T, A[i,])
  }
  out
}
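## Example (illustrative sketch, not from the original script; wrapped in if(FALSE)
## so sourcing this file stays side-effect free): contract the last mode of a
## coefficient tensor with one covariate vector and with a full design matrix.
## All toy objects (B.toy, x.toy, X.toy) are made up for illustration only.
if(FALSE){
  B.toy = array(rnorm(3*4*2*5), c(3, 4, 2, 5))  # d1 * d2 * d3 * p coefficient tensor
  x.toy = rnorm(5)                              # one covariate vector (p = 5)
  X.toy = matrix(rnorm(10*5), 10, 5)            # design matrix with n = 10 rows
  Yhat.one = Ta(B.toy, x.toy)                   # 3 * 4 * 2 fitted tensor for one observation
  Yhat.all = TA(B.toy, X.toy)                   # list of 10 fitted tensors
}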
Tabc = function(T, a, b, c){
  # compute the tensor-vector product T(a, b, c) for a third-order tensor T and
  # vectors a, b, c whose lengths match the corresponding tensor dimensions
  d1 = dim(T)[1]
  d2 = dim(T)[2]
  d3 = dim(T)[3]
  tmp = 0
  for(i in 1:d1){
    for(j in 1:d2){
      for(k in 1:d3){
        tmp = tmp + a[i] * b[j] * c[k] * T[i,j,k]
      }
    }
  }
  tmp
}

lasso_operator <- function(x, lambda){
  ### Soft-thresholding operator ###
  k = length(x)
  temp = rep(0, k)
  for(i in 1:k){
    temp[i] = sign(x[i]) * max(abs(x[i]) - lambda, 0)
  }
  return(temp)
}

listnorm <- function(X, Y){
  ### List Frobenius norm n^{-1} \sum_{i=1}^n \| X[[i]] - Y[[i]] \|_F ###
  n = length(Y)
  if(length(X) != n){
    stop("sizes of X and Y do not match!")
  }else{
    value = rep(0, n)
    for(i in 1:n){
      value[i] = fnorm(X[[i]] - Y[[i]])
    }
  }
  mean(value)
}

sumnorm <- function(T, A){
  ### Calculate sum(T * A) with * the Hadamard (element-wise) product ###
  DD = dim(T)
  m = length(DD)
  temp = 0
  if(m == 2){
    for(i in 1:DD[1]){
      for(j in 1:DD[2]){
        temp = temp + T[i,j] * A[i,j]
      }
    }
  }else if(m == 3){
    for(i in 1:DD[1]){
      for(j in 1:DD[2]){
        for(k in 1:DD[3]){
          temp = temp + T[i,j,k] * A[i,j,k]
        }
      }
    }
  }
  return(temp)
}

dist_sign = function(a.est, a){
  # compute the sign-invariant distance min{ \|a.est - a\|, \|a.est + a\| }
  a.est = as.matrix(a.est)
  a = as.matrix(a)
  if(ncol(a.est) == 1){
    out = min(norm(a - a.est, type = "2"), norm(a + a.est, type = "2"))
  }else{
    out = min(norm(a - a.est, type = "F"), norm(a + a.est, type = "F"))
  }
  out
}
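## Example (illustrative sketch, not from the original script): soft-thresholding
## shrinks every entry toward zero by lambda and zeroes out the small ones, while
## mytruncate() keeps the s largest-magnitude entries untouched.
if(FALSE){
  x.toy = c(3, -0.5, 1.2, -2, 0.1)
  lasso_operator(x.toy, lambda = 1)   # c(2, 0, 0.2, -1, 0)
  mytruncate(x.toy, s = 2)            # c(3, 0, 0, -2, 0)
}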
mytprfpr = function(Beta.hat, Beta.true, is.symmetric = FALSE){
  # compute the variable selection error of our method
  # Beta.hat, Beta.true: lists {Beta_1, ..., Beta_m, Beta_{m+1}} with Beta_j \in R^{K * d_j}
  # note: confusionMatrix() in recent versions of caret requires factor inputs,
  # hence the explicit factor(..., levels = c(0, 1)) conversions below
  K = dim(Beta.true[[1]])[1]
  m = length(Beta.true) - 1
  if(m == 2 && is.symmetric == TRUE){
    # symmetric matrix case in simulation 3
    A.est = t(Beta.hat[[1]])
    A = t(Beta.true[[1]])
    ## match each estimated component to its closest true component (up to sign)
    ERROR = matrix(NA, K, K)
    j.best = rep(0, K)
    for(i in 1:K){
      for(j in 1:K){
        ERROR[i,j] = min(norm(A.est[,i] - A[,j], type = "2"), norm(A.est[,i] + A[,j], type = "2"))
      }
      j.best[i] = which.min(ERROR[i,])
    }
    TPR.vec = rep(0, K)
    FPR.vec = rep(0, K)
    IND_hat = Beta.hat[[1]]
    IND_true = Beta.true[[1]]
    IND_hat[which(Beta.hat[[1]] != 0)] = 1
    IND_true[which(Beta.true[[1]] != 0)] = 1
    for(i in 1:K){
      if(length(which(Beta.hat[[1]] == 0)) == 0){
        ## no zero entries at all: everything is selected
        TPR.vec[i] = 1
        FPR.vec[i] = 1
      }else{
        cm = confusionMatrix(factor(IND_hat[i,], levels = c(0, 1)),
                             factor(IND_true[j.best[i],], levels = c(0, 1)), positive = "1")
        TPR.vec[i] = cm$byClass[1]
        FPR.vec[i] = 1 - cm$byClass[2]
      }
    }
  }else{
    A.est = t(Beta.hat[[1]])
    B.est = t(Beta.hat[[2]])
    C.est = t(Beta.hat[[3]])
    A = t(Beta.true[[1]])
    B = t(Beta.true[[2]])
    C = t(Beta.true[[3]])
    ## find the corresponding index in Beta.hat and Beta.true
    ERROR = matrix(NA, K, K)
    j.best = rep(0, K)
    for(i in 1:K){
      for(j in 1:K){
        ERROR[i,j] = min(norm(A.est[,i] - A[,j], type = "2"), norm(A.est[,i] + A[,j], type = "2"))
      }
      j.best[i] = which.min(ERROR[i,])
    }
    ## variable selection error
    IA = A; IB = B; IC = C
    IA[which(A != 0)] = 1
    IB[which(B != 0)] = 1
    IC[which(C != 0)] = 1
    IA.est = A.est; IB.est = B.est; IC.est = C.est
    IA.est[which(IA.est != 0)] = 1
    IB.est[which(IB.est != 0)] = 1
    IC.est[which(IC.est != 0)] = 1
    TPR.vec = rep(0, K)
    FPR.vec = rep(0, K)
    for(ii in 1:K){
      if(length(unique(IA.est[,ii])) == 1 && IA.est[1,ii] == 1){
        tpr.a = 1
        fpr.a = 1
      }else if(length(unique(IA.est[,ii])) == 1 && IA.est[1,ii] == 0){
        tpr.a = 0
        fpr.a = 0
      }else{
        cm.a = confusionMatrix(factor(IA.est[,ii], levels = c(0, 1)),
                               factor(IA[,j.best[ii]], levels = c(0, 1)), positive = "1")
        tpr.a = cm.a$byClass[1]
        fpr.a = 1 - cm.a$byClass[2]
      }
      if(length(unique(IB.est[,ii])) == 1 && IB.est[1,ii] == 1){
        tpr.b = 1
        fpr.b = 1
      }else if(length(unique(IB.est[,ii])) == 1 && IB.est[1,ii] == 0){
        tpr.b = 0
        fpr.b = 0
      }else{
        cm.b = confusionMatrix(factor(IB.est[,ii], levels = c(0, 1)),
                               factor(IB[,j.best[ii]], levels = c(0, 1)), positive = "1")
        tpr.b = cm.b$byClass[1]
        fpr.b = 1 - cm.b$byClass[2]
      }
      if(length(unique(IC.est[,ii])) == 1 && IC.est[1,ii] == 1){
        tpr.c = 1
        fpr.c = 1
      }else if(length(unique(IC.est[,ii])) == 1 && IC.est[1,ii] == 0){
        tpr.c = 0
        fpr.c = 0
      }else{
        cm.c = confusionMatrix(factor(IC.est[,ii], levels = c(0, 1)),
                               factor(IC[,j.best[ii]], levels = c(0, 1)), positive = "1")
        tpr.c = cm.c$byClass[1]
        fpr.c = 1 - cm.c$byClass[2]
      }
      TPR.vec[ii] = mean(c(tpr.a, tpr.b, tpr.c))
      FPR.vec[ii] = mean(c(fpr.a, fpr.b, fpr.c))
    }
  }
  c(mean(TPR.vec), mean(FPR.vec))
}

mytprfpr_B = function(B.hat, B.true){
  # compute the variable selection error of the whole coefficient tensor B; output (TPR, FPR, F1)
  # B.hat, B.true \in R^{d_1 * ... * d_{m+1}}
  m = length(dim(B.true)) - 1
  p = dim(B.true)[m+1]
  alldim = prod(dim(B.true))
  IND_hat = rep(0, alldim)
  IND_true = rep(0, alldim)
  IND_hat[which(B.hat != 0)] = 1
  IND_true[which(B.true != 0)] = 1
  if(length(which(B.hat == 0)) == 0){
    ## B.hat is entirely non-zero
    TPR.vec = 1
    FPR.vec = 1
    precision.val = length(which(B.true != 0)) / alldim
    recall.val = 1
    f1 = as.numeric(2 * precision.val * recall.val / (precision.val + recall.val))
  }else if(length(which(B.hat != 0)) == 0){
    ## B.hat is entirely zero
    TPR.vec = 0
    FPR.vec = 0
    precision.val = length(which(B.true != 0)) / alldim
    recall.val = 0
    f1 = as.numeric(2 * precision.val * recall.val / (precision.val + recall.val))
  }else{
    cm = confusionMatrix(factor(IND_hat, levels = c(0, 1)),
                         factor(IND_true, levels = c(0, 1)), positive = "1")
    TPR.vec = cm$byClass[1]
    FPR.vec = 1 - cm$byClass[2]
    confusion.table = cm$table
    TN = confusion.table[1,1]
    TP = confusion.table[2,2]
    FN = confusion.table[1,2]
    FP = confusion.table[2,1]
    if(TP == 0){
      ## avoid a NaN F1 score
      TP = 0.1
    }
    precision.val = TP / (TP + FP)
    recall.val = TP / (TP + FN)
    f1 = as.numeric(2 * precision.val * recall.val / (precision.val + recall.val))
  }
  as.numeric(c(TPR.vec, FPR.vec, f1))
}
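## Example (illustrative sketch, not from the original script): selection accuracy
## of a recovered sparse coefficient tensor against the truth; the toy tensors
## below are made up for illustration only.
if(FALSE){
  B.true.toy = array(0, c(4, 4, 4, 3)); B.true.toy[1:2, 1:2, 1:2, 1] = 1
  B.hat.toy = B.true.toy; B.hat.toy[3, 3, 3, 2] = 0.5   # one false positive
  mytprfpr_B(B.hat.toy, B.true.toy)                     # returns c(TPR, FPR, F1)
}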
### lasso-type rank-1 sparse tensor decomposition (Allen, 2012) ###
stdlasso <- function(T, Para, niter = 20){
  # T: a matrix or a third-order tensor
  # Para = c(lambda_1, ..., lambda_m): soft-thresholding levels for the components
  # niter: number of iterations
  DD = dim(T)
  num_m = length(DD)
  set.seed(1)
  beta.new = list()
  for(j in 1:num_m){
    beta.new[[j]] = rnorm(DD[j])
  }
  t = 0
  while(t < niter){
    t = t + 1
    beta.old = beta.new
    beta.tmp = list()
    for(j in 1:num_m){
      beta.tmp[[j]] = rep(0, DD[j])
    }
    if(num_m == 2){
      ## beta_1 ##
      for(j in 1:DD[2]){
        beta.tmp[[1]] = beta.tmp[[1]] + beta.old[[2]][j] * T[, j]
      }
      beta.tmp[[1]] = beta.tmp[[1]] / norm(beta.tmp[[1]], type = "2")
      shrunk = lasso_operator(beta.tmp[[1]], Para[1])
      if(norm(shrunk, type = "2") > 0){
        beta.new[[1]] = shrunk / norm(shrunk, type = "2")
      }else{
        beta.new[[1]] = rep(0, DD[1])
        break
      }
      ## beta_2 ##
      for(i in 1:DD[1]){
        beta.tmp[[2]] = beta.tmp[[2]] + beta.old[[1]][i] * T[i, ]
      }
      beta.tmp[[2]] = beta.tmp[[2]] / norm(beta.tmp[[2]], type = "2")
      shrunk = lasso_operator(beta.tmp[[2]], Para[2])
      if(norm(shrunk, type = "2") > 0){
        beta.new[[2]] = shrunk / norm(shrunk, type = "2")
      }else{
        beta.new[[2]] = rep(0, DD[2])
        break
      }
    }else if(num_m == 3){
      ## beta_1 ##
      for(j in 1:DD[2]){
        for(k in 1:DD[3]){
          beta.tmp[[1]] = beta.tmp[[1]] + beta.old[[2]][j] * beta.old[[3]][k] * T[, j, k]
        }
      }
      beta.tmp[[1]] = beta.tmp[[1]] / norm(beta.tmp[[1]], type = "2")
      shrunk = lasso_operator(beta.tmp[[1]], Para[1])
      if(norm(shrunk, type = "2") > 0){
        beta.new[[1]] = shrunk / norm(shrunk, type = "2")
      }else{
        beta.new[[1]] = rep(0, DD[1])
        break
      }
      ## beta_2 ##
      for(i in 1:DD[1]){
        for(k in 1:DD[3]){
          beta.tmp[[2]] = beta.tmp[[2]] + beta.old[[1]][i] * beta.old[[3]][k] * T[i, , k]
        }
      }
      beta.tmp[[2]] = beta.tmp[[2]] / norm(beta.tmp[[2]], type = "2")
      shrunk = lasso_operator(beta.tmp[[2]], Para[2])
      if(norm(shrunk, type = "2") > 0){
        beta.new[[2]] = shrunk / norm(shrunk, type = "2")
      }else{
        beta.new[[2]] = rep(0, DD[2])
        break
      }
      ## beta_3 ##
      for(i in 1:DD[1]){
        for(j in 1:DD[2]){
          beta.tmp[[3]] = beta.tmp[[3]] + beta.old[[1]][i] * beta.old[[2]][j] * T[i, j, ]
        }
      }
      beta.tmp[[3]] = beta.tmp[[3]] / norm(beta.tmp[[3]], type = "2")
      shrunk = lasso_operator(beta.tmp[[3]], Para[3])
      if(norm(shrunk, type = "2") > 0){
        beta.new[[3]] = shrunk / norm(shrunk, type = "2")
      }else{
        beta.new[[3]] = rep(0, DD[3])
        break
      }
    }
  }
  for(j in 1:num_m){
    beta.new[[j]] = as.matrix(beta.new[[j]])
  }
  # use if/else rather than the vectorized ifelse() for scalar control flow
  if(num_m == 2){
    w = as.numeric(t(beta.new[[1]]) %*% T %*% beta.new[[2]])
  }else{
    w = Tabc(T, beta.new[[1]], beta.new[[2]], beta.new[[3]])
  }
  OUT = list()
  OUT$beta = beta.new
  OUT$w = w
  OUT$t = t
  return(OUT)
}
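## Example (illustrative sketch, not from the original script): rank-1 sparse
## decomposition of a noisy rank-1 tensor via soft-thresholding. The toy factors
## u, v, w0 and the threshold level 0.1 are arbitrary illustrative choices.
if(FALSE){
  u = mysd(c(1, -1, rep(0, 8))); v = mysd(c(rep(0, 8), 1, 1)); w0 = mysd(rep(1, 10))
  T.toy = 5 * outer(outer(u, v), w0) + array(rnorm(1000, sd = 0.1), c(10, 10, 10))
  out = stdlasso(T.toy, Para = c(0.1, 0.1, 0.1))
  out$w                          # estimated rank-1 weight, close to 5
  dist_sign(out$beta[[1]], u)    # sign-invariant estimation error of the first factor
}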
### truncation-type rank-1 sparse tensor decomposition (Sun et al., 2016) ###
stdtruncate <- function(T, Para, is.symmetric = FALSE, real_3d = FALSE, niter = 20){
  # T: a matrix or a third-order tensor
  # Para = c(s_1, ..., s_m): number of non-zero elements in each component
  # niter: number of iterations
  DD = dim(T)
  num_m = length(DD)
  if(is.symmetric == TRUE){
    ## special case with beta_1 = ... = beta_m
    if(length(unique(DD)) > 1){
      stop("Input tensor is not symmetric. Cannot apply symmetric tensor decomposition!")
    }else{
      dd = DD[1]
      if(num_m == 2){
        set.seed(2)
        a.tmp = svd(T)$u[,1]
        a.new = mysd(mytruncate(mysd(a.tmp), Para[1]))
      }else{
        set.seed(2)
        a.new = rnorm(dd)
        a.old = rep(0, dd)
        TER.ERROR = NULL
        t = 0
        while(norm(a.old - a.new, type = "2") >= EPS && t < niter){
          TER.ERROR = append(TER.ERROR, norm(a.old - a.new, type = "2"))
          t = t + 1
          a.old = a.new
          a.tmp = rep(0, dd)
          for(j in 1:dd){
            for(k in 1:dd){
              a.tmp = a.tmp + a.old[j] * a.old[k] * T[, j, k]
            }
          }
          a.new = mysd(mytruncate(mysd(a.tmp), Para[1]))
        }
      }
      beta.new = list()
      for(j in 1:num_m){
        beta.new[[j]] = as.matrix(a.new)
      }
    }
  }else{
    set.seed(1)
    beta.new = list()
    beta.old = list()
    for(j in 1:num_m){
      beta.new[[j]] = rnorm(DD[j])
      beta.old[[j]] = rep(0, DD[j])
    }
    t = 0
    w.tmp = NULL
    TER.ERROR = NULL
    while(myerrorlist(beta.old, beta.new) >= EPS && t < niter){
      TER.ERROR = append(TER.ERROR, myerrorlist(beta.old, beta.new))
      t = t + 1
      beta.old = beta.new
      beta.tmp = list()
      for(j in 1:num_m){
        beta.tmp[[j]] = rep(0, DD[j])
      }
      if(num_m == 2){
        ## beta_1 ##
        for(j in 1:DD[2]){
          beta.tmp[[1]] = beta.tmp[[1]] + beta.old[[2]][j] * T[, j]
        }
        beta.new[[1]] = mysd(mytruncate(mysd(beta.tmp[[1]]), Para[1]))
        ## beta_2 ##
        for(i in 1:DD[1]){
          beta.tmp[[2]] = beta.tmp[[2]] + beta.old[[1]][i] * T[i, ]
        }
        beta.new[[2]] = mysd(mytruncate(mysd(beta.tmp[[2]]), Para[2]))
      }else if(num_m == 3){
        ## beta_1 ##
        for(j in 1:DD[2]){
          for(k in 1:DD[3]){
            beta.tmp[[1]] = beta.tmp[[1]] + beta.old[[2]][j] * beta.old[[3]][k] * T[, j, k]
          }
        }
        beta.new[[1]] = mysd(mytruncate(mysd(beta.tmp[[1]]), Para[1]))
        ## beta_2 ##
        for(i in 1:DD[1]){
          for(k in 1:DD[3]){
            if(real_3d == TRUE){
              # real-data variant: use the freshly updated beta_1 (Gauss-Seidel style)
              beta.tmp[[2]] = beta.tmp[[2]] + beta.new[[1]][i] * beta.old[[3]][k] * T[i, , k]
            }else{
              beta.tmp[[2]] = beta.tmp[[2]] + beta.old[[1]][i] * beta.old[[3]][k] * T[i, , k]
            }
          }
        }
        beta.new[[2]] = mysd(mytruncate(mysd(beta.tmp[[2]]), Para[2]))
        ## beta_3 ##
        for(i in 1:DD[1]){
          for(j in 1:DD[2]){
            if(real_3d == TRUE){
              beta.tmp[[3]] = beta.tmp[[3]] + beta.new[[1]][i] * beta.new[[2]][j] * T[i, j, ]
            }else{
              beta.tmp[[3]] = beta.tmp[[3]] + beta.old[[1]][i] * beta.old[[2]][j] * T[i, j, ]
            }
          }
        }
        beta.new[[3]] = mysd(mytruncate(mysd(beta.tmp[[3]]), Para[3]))
      }
      # track the rank-1 weight across iterations (for debugging)
      if(num_m == 2){
        w.tmp = append(w.tmp, as.numeric(t(as.matrix(beta.new[[1]])) %*% T %*% as.matrix(beta.new[[2]])))
      }else{
        w.tmp = append(w.tmp, Tabc(T, as.matrix(beta.new[[1]]), as.matrix(beta.new[[2]]), as.matrix(beta.new[[3]])))
      }
    }
    for(j in 1:num_m){
      beta.new[[j]] = as.matrix(beta.new[[j]])
    }
  }
  if(num_m == 2){
    w = as.numeric(t(beta.new[[1]]) %*% T %*% beta.new[[2]])
  }else{
    w = Tabc(T, beta.new[[1]], beta.new[[2]], beta.new[[3]])
  }
  OUT = list()
  OUT$beta = beta.new
  OUT$w = w
  return(OUT)
}
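## Example (illustrative sketch, not from the original script): the truncation
## variant recovers the same rank-1 structure as the stdlasso sketch above, but
## with explicit per-mode sparsity counts instead of a soft-threshold level.
if(FALSE){
  u = mysd(c(1, -1, rep(0, 8))); v = mysd(c(rep(0, 8), 1, 1)); w0 = mysd(rep(1, 10))
  T.toy = 5 * outer(outer(u, v), w0) + array(rnorm(1000, sd = 0.1), c(10, 10, 10))
  out = stdtruncate(T.toy, Para = c(2, 2, 10))   # keep 2, 2, 10 non-zeros per mode
  out$w                                          # close to 5
}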
myobj = function(Y, X, W, B){
  # objective function to be minimized
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * d2 * d3
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  # W: a K-dim vector of weights W = (w_1, ..., w_K)
  # B: a list of parameters B = {B_1, ..., B_m, B_{m+1}} with B[[j]] = B_j \in R^{K * d_j} for j = 1, ..., m+1; here d_{m+1} = p
  n = nrow(X)
  K = length(W)
  m = length(B) - 1
  A = list()
  for(k in 1:K){
    A[[k]] = B[[1]][k,]
    for(j in 2:m){
      A[[k]] = outer(A[[k]], B[[j]][k,])
    }
    A[[k]] = W[k] * A[[k]]
  }
  obj = 0
  for(i in 1:n){
    Diff = array(0, dim(Y[[1]]))
    for(k in 1:K){
      Diff = Diff + A[[k]] * as.numeric(t(B[[m+1]][k,]) %*% X[i,])
    }
    obj = obj + fnorm(Y[[i]] - Diff)^2
  }
  obj = obj / n
  obj
}

#####################
## Main Algorithms
#####################

mystor <- function(Y, X, Para, K, stdmethod = "truncate", is.symmetric = FALSE, niter = 20,
                   allow.restart = FALSE, real_3d = FALSE, is.center = FALSE){
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * d2 * d3
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  # Para = (lambda_1, ..., lambda_m) or (s_1, ..., s_m): tuning parameters for components A, B, C, ...
  # K: rank
  # niter: number of iterations
  m = length(dim(Y[[1]]))
  X = as.matrix(X)
  n = nrow(X)
  p = ncol(X)
  DD = c(lapply(Y, dim)[[1]], p)   # DD = (d_1, ..., d_m, p)
  if(is.center == TRUE){
    X = apply(X, 2, mycenter)
    Y = mytensorcenter(Y)
  }
  if(allow.restart == FALSE){
    i.restart = 0
    ## Step 1: Initialization
    Beta = list()
    for(j in 1:length(DD)){
      Beta[[j]] = matrix(0, K, DD[j])
      for(k in 1:K){
        set.seed(j + 10*k + (i.restart - 1)*100)
        Beta[[j]][k,] = mysd(rnorm(DD[j]))
      }
    }
    W = rep(1, K)
    ## Step 2: Alternating update
    t = 0
    error = 1
    TRUE.ERROR = NULL
    TERMINATE.ERROR = NULL
    OBJ = NULL
    W.mat = W
    while(error > EPS && t <= niter){
      W.old = W
      Beta.old = Beta
      t = t + 1
      ## step 2_1: update beta_k^{(1)}, ..., beta_k^{(m)} and w_k, for k = 1, ..., K
      tmp = list()
      for(k in 1:K){
        tmp[[k]] = Beta[[1]][k,]
        for(j in 2:m){
          tmp[[k]] = outer(tmp[[k]], Beta[[j]][k,])
        }
        tmp[[k]] = W[k] * tmp[[k]]
      }
      alpha = matrix(0, n, K)
      for(k in 1:K){
        for(i in 1:n){
          alpha[i,k] = as.numeric(t(Beta[[m+1]][k,]) %*% X[i,])
        }
      }
      for(k in 1:K){
        R = list()
        for(i in 1:n){
          Diff = array(0, dim(Y[[1]]))
          for(k_prime in 1:K){
            if(k_prime != k && alpha[i,k_prime] != 0){
              Diff = Diff + tmp[[k_prime]] * alpha[i,k_prime]
            }
          }
          if(alpha[i,k] != 0){
            R[[i]] = (Y[[i]] - Diff) / alpha[i,k]
          }else{
            R[[i]] = array(0, dim(Y[[i]]))
          }
        }
        R_bar = array(0, dim(Y[[1]]))
        index = which(alpha[,k] != 0)
        for(j in 1:length(index)){
          R_bar = R_bar + alpha[index[j], k]^2 * R[[index[j]]]
        }
        R_bar = R_bar / length(index)
        if(stdmethod == "lasso"){
          # lasso-penalized sparse tensor decomposition
          OUT_std = stdlasso(R_bar, Para)
        }else{
          # truncated sparse tensor decomposition
          OUT_std = stdtruncate(R_bar, Para, is.symmetric, real_3d)
        }
        if(is.center == TRUE){
          W[k] = OUT_std$w / mean(alpha[,k]^2)
        }else{
          W[k] = OUT_std$w
        }
        for(j in 1:m){
          Beta[[j]][k,] = OUT_std$beta[[j]]
        }
        tmp[[k]] = Beta[[1]][k,]
        for(j in 2:m){
          tmp[[k]] = outer(tmp[[k]], Beta[[j]][k,])
        }
        tmp[[k]] = W[k] * tmp[[k]]
      }
      ## step 2_2: update beta_k^{(m+1)}, for k = 1, ..., K
      A = list()
      for(k in 1:K){
        A[[k]] = Beta[[1]][k,]
        for(j in 2:m){
          A[[k]] = outer(A[[k]], Beta[[j]][k,])
        }
        A[[k]] = W[k] * A[[k]]
      }
      for(k in 1:K){
        Omega = solve(n^{-1} * t(X) %*% X)
        T = list()
        numerator = rep(0, p)
        for(i in 1:n){
          Diff = array(0, dim(Y[[1]]))
          for(k_prime in 1:K){
            if(k_prime != k){
              Diff = Diff + A[[k_prime]] * as.numeric(t(Beta[[m+1]][k_prime,]) %*% X[i,])
            }
          }
          T[[i]] = Y[[i]] - Diff
          numerator = numerator + sumnorm(T[[i]], A[[k]]) * X[i,]
        }
        numerator = numerator / n
        Beta[[m+1]][k,] = Omega %*% numerator / (fnorm(A[[k]]))^2
        Beta[[m+1]][k,] = Beta[[m+1]][k,] / norm(Beta[[m+1]][k,], type = "2")
      }
      ## step 2_3: update error \|B.hat - B.hat.old\|_F
      B.hat = array(0, DD)
      for(k in 1:K){
        if(m == 2){
          B.hat = B.hat + W[k] * outer(outer(Beta[[1]][k,], Beta[[2]][k,]), Beta[[3]][k,])
        }else if(m == 3){
          B.hat = B.hat + W[k] * outer(outer(outer(Beta[[1]][k,], Beta[[2]][k,]), Beta[[3]][k,]), Beta[[4]][k,])
        }
      }
      B.hat.old = array(0, DD)
      for(k in 1:K){
        if(m == 2){
          B.hat.old = B.hat.old + W.old[k] * outer(outer(Beta.old[[1]][k,], Beta.old[[2]][k,]), Beta.old[[3]][k,])
        }else if(m == 3){
          B.hat.old = B.hat.old + W.old[k] * outer(outer(outer(Beta.old[[1]][k,], Beta.old[[2]][k,]), Beta.old[[3]][k,]), Beta.old[[4]][k,])
        }
      }
      error = fnorm(B.hat - B.hat.old) / fnorm(B.hat)
      TERMINATE.ERROR = append(TERMINATE.ERROR, error)
      OBJ = append(OBJ, myobj(Y, X, W, Beta))
      W.mat = cbind(W.mat, W)
      ## stop early if the iterates diverge, to avoid numerical errors
      if(myobj(Y, X, W, Beta) > 1e+50 || sum(abs(W)) > 1e+50){break}
    }
  }else{
    ## allow.restart == TRUE: rerun with a fresh initialization (up to 2 starts)
    ## when the fit looks degenerate; the update steps below mirror the branch above
    is.restart = TRUE
    i.restart = 0
    TIME = NULL
    while(is.restart == TRUE && i.restart < 2){
      t1 = proc.time()
      i.restart = i.restart + 1
      ## Step 1: Initialization
      Beta = list()
      for(j in 1:length(DD)){
        Beta[[j]] = matrix(0, K, DD[j])
        for(k in 1:K){
          set.seed(j + 10*k + (i.restart - 1)*100)
          Beta[[j]][k,] = mysd(rnorm(DD[j]))
        }
      }
      W = rep(1, K)
      ## Step 2: Alternating update
      t = 0
      error = 1
      TRUE.ERROR = NULL
      TERMINATE.ERROR = NULL
      OBJ = NULL
      W.mat = W
      while(error > EPS && t <= niter){
        W.old = W
        Beta.old = Beta
        t = t + 1
        ## step 2_1: update beta_k^{(1)}, ..., beta_k^{(m)} and w_k, for k = 1, ..., K
        tmp = list()
        for(k in 1:K){
          tmp[[k]] = Beta[[1]][k,]
          for(j in 2:m){
            tmp[[k]] = outer(tmp[[k]], Beta[[j]][k,])
          }
          tmp[[k]] = W[k] * tmp[[k]]
        }
        alpha = matrix(0, n, K)
        for(k in 1:K){
          for(i in 1:n){
            alpha[i,k] = as.numeric(t(Beta[[m+1]][k,]) %*% X[i,])
          }
        }
        for(k in 1:K){
          R = list()
          for(i in 1:n){
            Diff = array(0, dim(Y[[1]]))
            for(k_prime in 1:K){
              if(k_prime != k && alpha[i,k_prime] != 0){
                Diff = Diff + tmp[[k_prime]] * alpha[i,k_prime]
              }
            }
            if(alpha[i,k] != 0){
              R[[i]] = (Y[[i]] - Diff) / alpha[i,k]
            }else{
              R[[i]] = array(0, dim(Y[[i]]))
            }
          }
          R_bar = array(0, dim(Y[[1]]))
          index = which(alpha[,k] != 0)
          for(j in 1:length(index)){
            R_bar = R_bar + alpha[index[j], k]^2 * R[[index[j]]]
          }
          R_bar = R_bar / length(index)
          if(stdmethod == "lasso"){
            # lasso-penalized sparse tensor decomposition
            OUT_std = stdlasso(R_bar, Para)
          }else{
            # truncated sparse tensor decomposition
            OUT_std = stdtruncate(R_bar, Para, is.symmetric, real_3d)
          }
          if(is.center == TRUE){
            W[k] = OUT_std$w / mean(alpha[,k]^2)
          }else{
            W[k] = OUT_std$w
          }
          for(j in 1:m){
            Beta[[j]][k,] = OUT_std$beta[[j]]
          }
          tmp[[k]] = Beta[[1]][k,]
          for(j in 2:m){
            tmp[[k]] = outer(tmp[[k]], Beta[[j]][k,])
          }
          tmp[[k]] = W[k] * tmp[[k]]
        }
        ## guard against zero weights, which would cause an error in step 2_2
        if(min(abs(W)) == 0){
          error = -1000
          is.restart = TRUE
          next
        }
        ## step 2_2: update beta_k^{(m+1)}, for k = 1, ..., K
        A = list()
        for(k in 1:K){
          A[[k]] = Beta[[1]][k,]
          for(j in 2:m){
            A[[k]] = outer(A[[k]], Beta[[j]][k,])
          }
          A[[k]] = W[k] * A[[k]]
        }
        for(k in 1:K){
          Omega = solve(n^{-1} * t(X) %*% X)
          T = list()
          numerator = rep(0, p)
          for(i in 1:n){
            Diff = array(0, dim(Y[[1]]))
            for(k_prime in 1:K){
              if(k_prime != k){
                Diff = Diff + A[[k_prime]] * as.numeric(t(Beta[[m+1]][k_prime,]) %*% X[i,])
              }
            }
            T[[i]] = Y[[i]] - Diff
            numerator = numerator + sumnorm(T[[i]], A[[k]]) * X[i,]
          }
          numerator = numerator / n
          Beta[[m+1]][k,] = Omega %*% numerator / (fnorm(A[[k]]))^2
          Beta[[m+1]][k,] = Beta[[m+1]][k,] / norm(Beta[[m+1]][k,], type = "2")
        }
        ## step 2_3: update error \|B.hat - B.hat.old\|_F
        B.hat = array(0, DD)
        for(k in 1:K){
          if(m == 2){
            B.hat = B.hat + W[k] * outer(outer(Beta[[1]][k,], Beta[[2]][k,]), Beta[[3]][k,])
          }else if(m == 3){
            B.hat = B.hat + W[k] * outer(outer(outer(Beta[[1]][k,], Beta[[2]][k,]), Beta[[3]][k,]), Beta[[4]][k,])
          }
        }
        B.hat.old = array(0, DD)
        for(k in 1:K){
          if(m == 2){
            B.hat.old = B.hat.old + W.old[k] * outer(outer(Beta.old[[1]][k,], Beta.old[[2]][k,]), Beta.old[[3]][k,])
          }else if(m == 3){
            B.hat.old = B.hat.old + W.old[k] * outer(outer(outer(Beta.old[[1]][k,], Beta.old[[2]][k,]), Beta.old[[3]][k,]), Beta.old[[4]][k,])
          }
        }
        error = fnorm(B.hat - B.hat.old) / fnorm(B.hat)
        TERMINATE.ERROR = append(TERMINATE.ERROR, error)
        OBJ = append(OBJ, myobj(Y, X, W, Beta))
        W.mat = cbind(W.mat, W)
      }
      ## Compute the correlation between consecutive components; a very high
      ## correlation signals a risk of a local minimum, so restart the initialization
      cor.vec = NULL
      for(ii in 1:m){
        AA = Beta[[ii]]
        if(K >= 2){
          for(k in 1:(K-1)){
            cor.vec = append(cor.vec, cor(AA[k,], AA[k+1,]))
          }
        }else{
          cor.vec = 0
        }
      }
      if(max(abs(cor.vec)) > 0.95){
        is.restart = TRUE
      }else if(error == -1000){
        is.restart = TRUE
      }else{
        is.restart = FALSE
      }
      TIME = append(TIME, (proc.time() - t1)[3])
    }
  }
  ## Compute BIC for model selection:
  ## bic  = -2 * log(likelihood) + log(n) * df
  ## bic2 = log(SSE) + log(n*d1*d2*d3) / (n*d1*d2*d3) * df
  A_final = list()
  mydf = 0
  for(k in 1:K){
    A_final[[k]] = Beta[[1]][k,]
    mydf = sum(Beta[[1]] != 0)
    for(j in 2:m){
      A_final[[k]] = outer(A_final[[k]], Beta[[j]][k,])
      mydf = mydf + sum(Beta[[j]] != 0)
    }
    A_final[[k]] = W[k] * A_final[[k]]
  }
  fitting_error = 0
  residual = list()
  for(i in 1:n){
    Diff = array(0, dim(Y[[1]]))
    for(k in 1:K){
      Diff = Diff + A_final[[k]] * as.numeric(t(Beta[[m+1]][k,]) %*% X[i,])
    }
    residual[[i]] = Y[[i]] - Diff
    fitting_error = fitting_error + fnorm(residual[[i]])^2
  }
  fitting_error = 2 * fitting_error / n
  bic = fitting_error + log(n) * mydf
  bic2 = log(n * fitting_error / 2) + log(n * prod(DD[1:m])) / (n * prod(DD[1:m])) * mydf
  OUT = list()
  OUT$B.hat = B.hat
  OUT$Beta.hat = Beta
  OUT$W.hat = W
  OUT$TERMINATE.ERROR = TERMINATE.ERROR
  OUT$OBJ = OBJ
  OUT$num.iter = t
  OUT$bic = bic
  OUT$bic2 = bic2
  OUT$fitting_error = fitting_error
  OUT$df = mydf
  OUT$i.restart = i.restart
  return(OUT)
}
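## Example (illustrative sketch, not from the original script): fit STORE on
## simulated data with a matrix response and compare against the truth. All sizes
## and the sparsity levels Para = c(3, 3) are arbitrary illustrative choices; in
## practice (Para, K) can be selected by minimizing out$bic over a grid.
if(FALSE){
  set.seed(100)
  n = 50; p = 5; d1 = 10; d2 = 10; K.true = 2
  B1 = matrix(0, K.true, d1); B1[1, 1:3] = 1/sqrt(3); B1[2, 4:6] = 1/sqrt(3)
  B2 = matrix(0, K.true, d2); B2[1, 1:3] = 1/sqrt(3); B2[2, 4:6] = 1/sqrt(3)
  B3 = t(apply(matrix(rnorm(K.true * p), K.true, p), 1, mysd))
  X = matrix(rnorm(n * p), n, p)
  Y = list()
  for(i in 1:n){
    signal = matrix(0, d1, d2)
    for(k in 1:K.true){
      signal = signal + 5 * outer(B1[k,], B2[k,]) * as.numeric(t(B3[k,]) %*% X[i,])
    }
    Y[[i]] = signal + matrix(rnorm(d1 * d2, sd = 0.5), d1, d2)
  }
  B.true = array(0, c(d1, d2, p))
  for(k in 1:K.true){ B.true = B.true + 5 * outer(outer(B1[k,], B2[k,]), B3[k,]) }
  out = mystor(Y, X, Para = c(3, 3), K = 2)
  out$bic                                    # BIC, for tuning (Para, K)
  fnorm(out$B.hat - B.true) / fnorm(B.true)  # relative estimation error
}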
###########################################################################
## Vectorized OLS: fit each element of Y on X one at a time
###########################################################################
myols <- function(Y, X, is.symmetric = FALSE){
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * d2 * d3
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  m = length(dim(Y[[1]]))
  X = as.matrix(X)
  n = nrow(X)
  p = ncol(X)
  DD = c(lapply(Y, dim)[[1]], p)   # DD = (d_1, ..., d_m, p)
  Z = solve(t(X) %*% X) %*% t(X)
  B.hat = array(0, DD)
  if(m == 2){
    # matrix outcome
    for(i1 in 1:DD[1]){
      for(i2 in 1:DD[2]){
        for(j in 1:DD[3]){
          for(i in 1:n){
            B.hat[i1,i2,j] = B.hat[i1,i2,j] + Y[[i]][i1,i2] * Z[j,i]
          }
        }
      }
    }
  }else if(m == 3){
    # third-order tensor outcome
    for(i1 in 1:DD[1]){
      for(i2 in 1:DD[2]){
        for(i3 in 1:DD[3]){
          for(j in 1:DD[4]){
            for(i in 1:n){
              B.hat[i1,i2,i3,j] = B.hat[i1,i2,i3,j] + Y[[i]][i1,i2,i3] * Z[j,i]
            }
          }
        }
      }
    }
  }
  if(is.symmetric == TRUE && m == 2){
    ## in the symmetric matrix outcome case, use B.hat.new = [B.hat + t(B.hat)] / 2
    B.hat.new = B.hat
    for(jj in 1:p){
      B.hat.new[,,jj] = (B.hat[,,jj] + t(B.hat[,,jj])) / 2
    }
    B.hat = B.hat.new
  }
  B.hat
}

##########################################################################################################################
## Deprecated: Vectorized OLS, version 2: fit each element of Y on X one at a time. Same output as myols.
##########################################################################################################################
myols2 <- function(Y, X){
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * d2 * d3
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  m = length(dim(Y[[1]]))
  X = as.matrix(X)
  n = nrow(X)
  p = ncol(X)
  DD = c(lapply(Y, dim)[[1]], p)   # DD = (d_1, ..., d_m, p)
  B.hat = array(0, DD)
  ## stack Y into a big array
  Y.stack = array(0, c(lapply(Y, dim)[[1]], n))
  for(i in 1:n){
    if(m == 2){
      Y.stack[,,i] = Y[[i]]
    }else if(m == 3){
      Y.stack[,,,i] = Y[[i]]
    }
  }
  ## fit one entry of the response tensor at a time via lm()
  if(m == 2){
    # matrix outcome
    for(i1 in 1:DD[1]){
      for(i2 in 1:DD[2]){
        y.vec = Y.stack[i1,i2,]
        data.lm = data.frame(cbind(y.vec, X))
        out.lm = lm(y.vec ~ 0 + ., data = data.lm)
        B.hat[i1,i2,] = as.numeric(out.lm$coefficients)
      }
    }
  }else if(m == 3){
    # third-order tensor outcome
    for(i1 in 1:DD[1]){
      for(i2 in 1:DD[2]){
        for(i3 in 1:DD[3]){
          y.vec = Y.stack[i1,i2,i3,]
          data.lm = data.frame(cbind(y.vec, X))
          out.lm = lm(y.vec ~ 0 + ., data = data.lm)
          B.hat[i1,i2,i3,] = as.numeric(out.lm$coefficients)
        }
      }
    }
  }
  B.hat
}

#############################################################################################
## Sparse OLS via hard thresholding: fit each element of Y on X one at a time
#############################################################################################
mytruncateols <- function(Y, X, gamma){
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * d2 * d3
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  # gamma: tuning parameter of the truncation step
  m = length(dim(Y[[1]]))
  X = as.matrix(X)
  n = nrow(X)
  p = ncol(X)
  DD = c(lapply(Y, dim)[[1]], p)   # DD = (d_1, ..., d_m, p)
  Z = solve(t(X) %*% X) %*% t(X)
  B.hat = array(0, DD)
  if(m == 2){
    # matrix outcome
    for(i1 in 1:DD[1]){
      for(i2 in 1:DD[2]){
        for(j in 1:DD[3]){
          for(i in 1:n){
            B.hat[i1,i2,j] = B.hat[i1,i2,j] + Y[[i]][i1,i2] * Z[j,i]
          }
        }
      }
    }
  }else if(m == 3){
    # third-order tensor outcome
    for(i1 in 1:DD[1]){
      for(i2 in 1:DD[2]){
        for(i3 in 1:DD[3]){
          for(j in 1:DD[4]){
            for(i in 1:n){
              B.hat[i1,i2,i3,j] = B.hat[i1,i2,i3,j] + Y[[i]][i1,i2,i3] * Z[j,i]
            }
          }
        }
      }
    }
  }
  B.hat[which(abs(B.hat) <= gamma)] = 0
  B.hat
}
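## Example (illustrative sketch, not from the original script; assumes the
## simulated Y and X from the mystor sketch above): dense element-wise OLS and
## its hard-thresholded version. The threshold gamma = 0.5 is arbitrary.
if(FALSE){
  B.ols = myols(Y, X)                           # dense element-wise OLS estimate
  B.sparse = mytruncateols(Y, X, gamma = 0.5)   # zero out entries with |b| <= 0.5
  mean(B.sparse == 0)                           # achieved sparsity level
}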
# tuning: \in {"BIC", "CV"}: one of these two tuning methods m = length(dim(Y[[1]])) X = as.matrix(X) n = nrow(X) p = ncol(X) DD = c(lapply(Y,dim)[[1]], p) # DD = (d_1, ..., d_m, p) B.hat = array(0, DD) ## stack Y into a big array Y.mat = matrix(0, n, prod(lapply(Y,dim)[[1]])) for(i in 1:n){ Y.mat[i,] = as.numeric(Y[[i]]) } ## fit sparse multiple response regression via remMap. only tune lamL1 for lasso penalty. set the group penalty lamL2 = 0. lamL1.v = sort(lam.vec, decreasing = TRUE) lamL2.v = 0 if(tuning == "BIC"){ out.bic = remMap.BIC(X, Y.mat, lamL1.v, lamL2.v) if(min(out.bic$BIC) == -Inf && max(out.bic$BIC) == -Inf){ lambda.opt = mean(lamL1.v) }else{ lambda.opt = lamL1.v[which.min(out.bic$BIC)[1]] } out.bic.opt = remMap(X, Y.mat, lamL1 = lambda.opt, lamL2 = 0) B.mat.opt = out.bic.opt$phi }else if(tuning == "CV"){ if(ncol(X) == 1){ # error in remMap.cv. add a redundant column to X. Xnew = cbind(X,rep(1,nrow(X))) out.cv = remMap.CV(Xnew, Y.mat, lamL1.v, lamL2.v, fold = 5) lam.opt = as.numeric(out.cv$l.index[1,which.min(out.cv$ols.cv)[1]]) out.cv.opt = remMap(Xnew, Y.mat, lamL1 = lam.opt, lamL2 = 0) B.mat.opt = matrix(out.cv.opt$phi[1,],p, ncol(Y.mat)) }else{ out.cv = remMap.CV(X, Y.mat, lamL1.v, lamL2.v, fold = 5) lam.opt = as.numeric(out.cv$l.index[1,which.min(out.cv$ols.cv)[1]]) out.cv.opt = remMap(X, Y.mat, lamL1 = lam.opt, lamL2 = 0) B.mat.opt = out.cv.opt$phi } } for(i in 1:p){ if(m==2){ B.hat[,,i] = array(B.mat.opt[i,], lapply(Y,dim)[[1]]) }else if(m==3){ B.hat[,,,i] = array(B.mat.opt[i,], lapply(Y,dim)[[1]]) } } if(is.symmetric == TRUE && m==2){ ## in matrix symmetric output case, use B.hat.new = [B.hat + t(B.hta)]/2 B.hat.new = B.hat for(jj in 1:p){ B.hat.new[,,jj] = (B.hat[,,jj] + t(B.hat[,,jj]))/2 } B.hat = B.hat.new } B.hat } ####################################################################################################################################### ## Higher-order low-rank regression (HOLLR) by Rabusseau and Kadri (2016, NIPS) ## \min_{\cW} { \| \cW \times_1 X - \cY \|_F^2 + \gamma \|\cW\|_F^2 } s.t. rank (\cW) <= (R0, R1, ..., Rm) ## Here \cY \in R^{n \times d1 \times d2 \times ... \times dm\}, X \in R^{n * p}, \cW \in R^{p \times d1 \times d2 \times ... \times dm \} ####################################################################################################################################### myholrr <- function(Y, X, R.vec = NULL, gamma = NULL, is.symmetric = FALSE) { # Y: a list of response tensor Y = {Y_1, ..., Y_n} wit Y[[i]] = Y_i a tensor of dimension d1 * ... * dm # X \in R^{n * p}: covariate X = (x_1, ..., x_n)^T with x_i a vector of dimension p # R.vec = (R0, R1, ..., Rm) \in R^{m+1} is the input rank for each mode of \cW. for fair comparison with ours, use same rank in all modes. # gamma: tuning parameter in ridge regression constraint. if(is.null(R.vec) | is.null(gamma)){ # tuning via CV out.cv = myholrrcv(Y, X, is.symmetric = is.symmetric) R.vec = out.cv$opt.R.vec gamma = out.cv$opt.gamma } m = length(dim(Y[[1]])) X = as.matrix(X) n = nrow(X) p = ncol(X) DD = c(lapply(Y,dim)[[1]], p) # DD = (d_1, ..., d_m, p) B.hat = array(0, DD) ## stack Y into a big array Y.tensor = array(0, c(n, lapply(Y,dim)[[1]])) for(i in 1:n){ if(m == 2){ Y.tensor[i,,] = Y[[i]] }else if(m == 3){ Y.tensor[i,,,] = Y[[i]] } } Y.tnsr = as.tensor(Y.tensor) ## compute U0, ... 
#######################################################################################################################################
## Higher-order low-rank regression (HOLRR) by Rabusseau and Kadri (2016, NIPS)
## \min_{\cW} { \| \cW \times_1 X - \cY \|_F^2 + \gamma \|\cW\|_F^2 } s.t. rank(\cW) <= (R0, R1, ..., Rm)
## Here \cY \in R^{n * d1 * d2 * ... * dm}, X \in R^{n * p}, \cW \in R^{p * d1 * d2 * ... * dm}
#######################################################################################################################################
myholrr <- function(Y, X, R.vec = NULL, gamma = NULL, is.symmetric = FALSE){
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * ... * dm
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  # R.vec = (R0, R1, ..., Rm) \in R^{m+1}: input rank for each mode of \cW; for a fair comparison with ours, use the same rank in all modes
  # gamma: tuning parameter of the ridge penalty
  if(is.null(R.vec) | is.null(gamma)){
    # tune via CV
    out.cv = myholrrcv(Y, X, is.symmetric = is.symmetric)
    R.vec = out.cv$opt.R.vec
    gamma = out.cv$opt.gamma
  }
  m = length(dim(Y[[1]]))
  X = as.matrix(X)
  n = nrow(X)
  p = ncol(X)
  DD = c(lapply(Y, dim)[[1]], p)   # DD = (d_1, ..., d_m, p)
  B.hat = array(0, DD)
  ## stack Y into a big array
  Y.tensor = array(0, c(n, lapply(Y, dim)[[1]]))
  for(i in 1:n){
    if(m == 2){
      Y.tensor[i,,] = Y[[i]]
    }else if(m == 3){
      Y.tensor[i,,,] = Y[[i]]
    }
  }
  Y.tnsr = as.tensor(Y.tensor)
  ## compute U0, ..., Um
  Y.mat = k_unfold(Y.tnsr, 1)@data
  vec.tmp = t(X) %*% Y.mat
  mat.tmp = solve(t(X) %*% X + gamma * diag(p)) %*% vec.tmp %*% t(vec.tmp)
  U0 = as.matrix(svd(mat.tmp)$u[, 1:R.vec[1]])
  U.all = list()
  for(i in 1:m){
    Y.mat2 = k_unfold(Y.tnsr, i+1)@data
    mat.tmp2 = Y.mat2 %*% t(Y.mat2)
    U.all[[i]] = as.matrix(svd(mat.tmp2)$u[, 1:R.vec[i+1]])
  }
  ## compute the core tensor \cG
  M.mat = solve(t(U0) %*% (t(X) %*% X + gamma * diag(p)) %*% U0) %*% t(U0) %*% t(X)
  G.tnsr = ttm(Y.tnsr, M.mat, 1)
  for(i in 1:m){
    G.tnsr = ttm(G.tnsr, t(U.all[[i]]), i+1)
  }
  ## compute the coefficient \cW
  W.tnsr = ttm(G.tnsr, U0, 1)
  for(i in 1:m){
    W.tnsr = ttm(W.tnsr, U.all[[i]], i+1)
  }
  W.tensor = W.tnsr@data
  ## convert \cW to the final coefficient \cB \in R^{d1 * ... * dm * p}
  for(i in 1:p){
    if(m == 2){
      B.hat[,,i] = W.tensor[i,,]
    }else if(m == 3){
      B.hat[,,,i] = W.tensor[i,,,]
    }
  }
  if(is.symmetric == TRUE && m == 2){
    ## in the symmetric matrix outcome case, use B.hat.new = [B.hat + t(B.hat)] / 2
    B.hat.new = B.hat
    for(jj in 1:p){
      B.hat.new[,,jj] = (B.hat[,,jj] + t(B.hat[,,jj])) / 2
    }
    B.hat = B.hat.new
  }
  B.hat
}
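## Example (illustrative sketch, not from the original script; assumes the
## simulated Y, X, B.true from the mystor sketch above): HOLRR with a fixed
## multilinear rank and ridge parameter. R.vec = c(1, 2, 2) and gamma = 1 are
## arbitrary illustrative choices for a matrix (m = 2) response.
if(FALSE){
  B.holrr = myholrr(Y, X, R.vec = c(1, 2, 2), gamma = 1)
  fnorm(B.holrr - B.true) / fnorm(B.true)   # relative estimation error
}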
#######################################################################################################################################
## 3-fold CV tuning of higher-order low-rank regression (HOLRR) by Rabusseau and Kadri (2016, NIPS)
## Set all Tucker ranks to be the same, for a fair comparison with our CP decomposition.
#######################################################################################################################################
myholrrcv <- function(Y, X, R.list = NULL, gamma.list = exp(seq(-3, 3, length = 5)), seed = 1, is.symmetric = FALSE){
  # Y: a list of response tensors Y = {Y_1, ..., Y_n} with Y[[i]] = Y_i a tensor of dimension d1 * ... * dm
  # X \in R^{n * p}: covariates X = (x_1, ..., x_n)^T with x_i a vector of dimension p
  # If R.list = NULL, use a grid of length 5 over (1, ..., min{d1, ..., dm})
  # gamma.list: candidate values of the ridge tuning parameter
  m = length(dim(Y[[1]]))
  X = as.matrix(X)
  n = nrow(X)
  p = ncol(X)
  DD = c(lapply(Y, dim)[[1]], p)   # DD = (d_1, ..., d_m, p)
  if(is.null(R.list)){
    R.max = min(lapply(Y, dim)[[1]])
    R.list = as.integer(seq(1, R.max, length = 5))
  }
  k1 = length(R.list)
  lambda1.v = sort(R.list)[k1:1]
  k2 = length(gamma.list)
  lambda2.v = sort(gamma.list)[k2:1]
  ## note: this lookup-table construction assumes length(R.list) == length(gamma.list)
  l1.index <- as.vector(matrix(lambda1.v, nrow = k1, ncol = k1, byrow = FALSE))
  l2.index <- as.vector(matrix(lambda2.v, nrow = k2, ncol = k2, byrow = TRUE))
  l.index <- rbind(l1.index, l2.index)
  ## split the data into 3 folds
  set.seed(seed)
  index.cv <- NULL
  fold = 3
  ran.order = sample(1:n, n, replace = FALSE)
  f.n = floor(n / fold)
  for(f in 1:(fold - 1)){
    index.cv[[f]] <- ran.order[(f - 1) * f.n + 1:f.n]
  }
  index.cv[[fold]] <- ran.order[((fold - 1) * f.n + 1):n]
  rss.cv.cv <- NULL
  for(f in 1:fold){
    index.cur.cv <- index.cv[[f]]
    X.m <- as.matrix(X[-(index.cur.cv), ])
    Y.m <- Y[-(index.cur.cv)]
    X.t <- as.matrix(X[index.cur.cv, ])
    Y.t <- Y[index.cur.cv]
    rss.cv.c <- matrix(0, k1, k2)
    for(i in 1:k1){
      for(j in 1:k2){
        cur.lam1 = lambda1.v[i]
        cur.lam2 = lambda2.v[j]
        Bhat.temp = myholrr(Y.m, X.m, R.vec = c(1, rep(cur.lam1, m)), gamma = cur.lam2, is.symmetric = is.symmetric)
        Y.predict = (ttm(as.tensor(Bhat.temp), X.t, m+1))@data
        Y.t.array = array(0, c(lapply(Y, dim)[[1]], length(Y.t)))
        for(k in 1:length(Y.t)){
          if(m == 2){
            Y.t.array[,,k] = Y.t[[k]]
          }else if(m == 3){
            Y.t.array[,,,k] = Y.t[[k]]
          }
        }
        rss.cv.c[i,j] = sum((Y.predict - Y.t.array)^2)
      }
    }
    rss.cv.cv[[f]] <- rss.cv.c
  }
  rss.cv.sum <- array(0, c(k1, k2, fold))
  for(f in 1:fold){
    rss.cv.sum[,,f] <- rss.cv.cv[[f]]
  }
  rss.cv <- apply(rss.cv.sum, c(1, 2), sum)
  rownames(rss.cv) = paste("lamL1=", round(lambda1.v, 3))
  colnames(rss.cv) = paste("lamL2=", round(lambda2.v, 3))
  opt.R.vec = c(1, rep(l.index[1, which.min(rss.cv)[1]], m))
  opt.gamma = l.index[2, which.min(rss.cv)[1]]
  result = list(opt.R.vec = opt.R.vec, opt.gamma = opt.gamma, rss.cv = rss.cv, l.index = l.index)
  return(result)
}
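## Example (illustrative sketch, not from the original script; assumes the
## simulated Y and X from the mystor sketch above): CV-tuned HOLRR. Leaving
## R.vec and gamma NULL in myholrr() triggers the same 3-fold search internally.
if(FALSE){
  cv.out = myholrrcv(Y, X)
  cv.out$opt.R.vec
  cv.out$opt.gamma
  B.holrr.cv = myholrr(Y, X, R.vec = cv.out$opt.R.vec, gamma = cv.out$opt.gamma)
}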